This jupyter notebook would be imported in the `homework_main-basic.ipynb` and `homework_main-advanced.ipynb` using `%run homework_modules.ipynb` command.
So each cell of this notebook would be call.

Thus, If you have some tests falling, just commit them out to check a functionality.

In [1]:
import numpy as np

In [2]:
# FOR TESTS ONLY!

import torch
from torch.autograd import Variable
import numpy
import traceback

VERBOSE = True

def assertAlmostEqual(expected, actual, msg: str="???", rtol=1e-05, atol=1e-08,):
    try:
        isEq = np.allclose(expected, actual, rtol=rtol, atol=atol)
    except Exception as err:
        print(f"{msg}: FAILED:\n  {err}")
        raise

    if isEq:
        if VERBOSE:
            print(f"{msg}: OK!")
    else:
        print(f"{msg}: FAILED:\n  expected={expected}\n  actual={actual}")
        raise AssertionError(f"{msg}: FAILED:\n  expected={expected}\  actual={actual}")
        
def assertTrue(condition, msg: str="???"):
    if condition:
        if VERBOSE:
            print(f"{msg}: OK!")
    else:
        print(f"{msg}: FAILED:\n  expected=True")
        raise AssertionError(f"{msg}: FAILED:\n  expected=True")

**Module** is an abstract class which defines fundamental methods necessary for a training a neural network. You do not need to change anything here, just read the comments.

In [3]:
class Module(object):
    """
    Basically, you can think of a module as of a something (black box) 
    which can process `input` data and produce `ouput` data.
    This is like applying a function which is called `forward`: 
        
        output = module.forward(input)
    
    The module should be able to perform a backward pass: to differentiate the `forward` function. 
    More, it should be able to differentiate it if is a part of chain (chain rule).
    The latter implies there is a gradient from previous step of a chain rule. 
    
        gradInput = module.backward(input, gradOutput)
    """
    def __init__ (self):
        self.output = None
        self.gradInput = None
        self.training = True
    
    def forward(self, input):
        """
        Takes an input object, and computes the corresponding output of the module.
        """
        return self.updateOutput(input)

    def backward(self,input, gradOutput):
        """
        Performs a backpropagation step through the module, with respect to the given input.
        
        This includes 
         - computing a gradient w.r.t. `input` (is needed for further backprop),
         - computing a gradient w.r.t. parameters (to update parameters while optimizing).
        """
        self.updateGradInput(input, gradOutput)
        self.accGradParameters(input, gradOutput)
        return self.gradInput
    

    def updateOutput(self, input):
        """
        Computes the output using the current parameter set of the class and input.
        This function returns the result which is stored in the `output` field.
        
        Make sure to both store the data in `output` field and return it. 
        """
        
        # The easiest case:
            
        # self.output = input 
        # return self.output
        
        pass

    def updateGradInput(self, input, gradOutput):
        """
        Computing the gradient of the module with respect to its own input. 
        This is returned in `gradInput`. Also, the `gradInput` state variable is updated accordingly.
        
        The shape of `gradInput` is always the same as the shape of `input`.
        
        Make sure to both store the gradients in `gradInput` field and return it.
        """
        
        # The easiest case:
        
        # self.gradInput = gradOutput 
        # return self.gradInput
        
        pass   
    
    def accGradParameters(self, input, gradOutput):
        """
        Computing the gradient of the module with respect to its own parameters.
        No need to override if module has no parameters (e.g. ReLU).
        """
        pass
    
    def zeroGradParameters(self): 
        """
        Zeroes `gradParams` variable if the module has params.
        """
        pass
        
    def getParameters(self):
        """
        Returns a list with its parameters. 
        If the module does not have parameters return empty list. 
        """
        return []
        
    def getGradParameters(self):
        """
        Returns a list with gradients with respect to its parameters. 
        If the module does not have parameters return empty list. 
        """
        return []
    
    def train(self):
        """
        Sets training mode for the module.
        Training and testing behaviour differs for Dropout, BatchNorm.
        """
        self.training = True
    
    def evaluate(self):
        """
        Sets evaluation mode for the module.
        Training and testing behaviour differs for Dropout, BatchNorm.
        """
        self.training = False
    
    def __repr__(self):
        """
        Pretty printing. Should be overrided in every module if you want 
        to have readable description. 
        """
        return "Module"

# Sequential container

**Define** a forward and backward pass procedures.

In [4]:
class Sequential(Module):
    """
         This class implements a container, which processes `input` data sequentially. 
         
         `input` is processed by each module (layer) in self.modules consecutively.
         The resulting array is called `output`. 
    """
    
    def __init__ (self):
        super(Sequential, self).__init__()
        self.modules = []
   
    def add(self, module):
        """
        Adds a module to the container.
        """
        self.modules.append(module)

    def updateOutput(self, input):
        """
        Basic workflow of FORWARD PASS:
        
            y_0    = module[0].forward(input)
            y_1    = module[1].forward(y_0)
            ...
            output = module[n-1].forward(y_{n-2})   
            
            
        Just write a little loop. 
        """
        module = self.modules[0].forward(input)
        for i in range(1,len(self.modules)):
            module = self.modules[i].forward(module)
        self.output = module
        return self.output

    def backward(self, input, gradOutput):
        """
        Workflow of BACKWARD PASS:
            
            g_{n-1} = module[n-1].backward(y_{n-2}, gradOutput)
            g_{n-2} = module[n-2].backward(y_{n-3}, g_{n-1})
            ...
            g_1 = module[1].backward(y_0, g_2)   
            gradInput = module[0].backward(input, g_1)   
             
             
        !!!
                
        To ech module you need to provide the input, module saw while forward pass, 
        it is used while computing gradients. 
        Make sure that the input for `i-th` layer the output of `module[i]` (just the same input as in forward pass) 
        and NOT `input` to this Sequential module. 
        
        !!!
        
        """
        # Your code goes here. ################################################
        n = len(self.modules)
        g = self.modules[n-1].backward(y[n-2],gradOutput)
        for i in range(n-2,-1,-1):
            g = self.modules[i].backward(self.modules[i-1].output,g)
        g_first = self.modules[1].backward(self.modules[0].output,g)
        self.gradInput = self.modules[0].backward(input,g_first)
        return self.gradInput
      

    def zeroGradParameters(self): 
        for module in self.modules:
            module.zeroGradParameters()
    
    def getParameters(self):
        """
        Should gather all parameters in a list.
        """
        return [x.getParameters() for x in self.modules]
    
    def getGradParameters(self):
        """
        Should gather all gradients w.r.t parameters in a list.
        """
        return [x.getGradParameters() for x in self.modules]
    
    def __repr__(self):
        string = "".join([str(x) + '\n' for x in self.modules])
        return string
    
    def __getitem__(self,x):
        return self.modules.__getitem__(x)
    
    def train(self):
        """
        Propagates training parameter through all modules
        """
        self.training = True
        for module in self.modules:
            module.train()
    
    def evaluate(self):
        """
        Propagates training parameter through all modules
        """
        self.training = False
        for module in self.modules:
            module.evaluate()

In [5]:
def test_Sequential():
    # Unfortunately this test you cannot run right now. 
    # It would be best to run it after implementing Linear layer and test it with Linear
    # instead of Batch norm
    
    # TODO: as a student you can try to fix it, or not. 

    np.random.seed(42)
    torch.manual_seed(42)

    batch_size, n_in = 2, 4
    for i in range(100):
        print(f"Iter {i}")
        # layers initialization
        alpha = 0.9
        torch_layer = torch.nn.BatchNorm1d(n_in, eps=BatchNormalization.EPS, momentum=1.-alpha, affine=True)
        torch_layer.bias.data = torch.from_numpy(np.random.random(n_in).astype(np.float32))
        custom_layer = Sequential()
        bn_layer = BatchNormalization(alpha)
        bn_layer.moving_mean = torch_layer.running_mean.numpy().copy()
        bn_layer.moving_variance = torch_layer.running_var.numpy().copy()
        custom_layer.add(bn_layer)
        scaling_layer = ChannelwiseScaling(n_in)
        scaling_layer.gamma = torch_layer.weight.data.numpy()
        scaling_layer.beta = torch_layer.bias.data.numpy()
        custom_layer.add(scaling_layer)
        custom_layer.train()

        layer_input = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)
        next_layer_grad = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)

        # 1. check layer output
        custom_layer_output = custom_layer.updateOutput(layer_input)
        layer_input_var = Variable(torch.from_numpy(layer_input), requires_grad=True)
        torch_layer_output_var = torch_layer(layer_input_var)
        assertAlmostEqual(torch_layer_output_var.data.numpy(), custom_layer_output, msg='1. check layer output')

        # 2. check layer input grad
        custom_layer_grad = custom_layer.backward(layer_input, next_layer_grad)
        torch_layer_output_var.backward(torch.from_numpy(next_layer_grad))
        torch_layer_grad_var = layer_input_var.grad
        assertAlmostEqual(torch_layer_grad_var.data.numpy(), custom_layer_grad, msg='2. check layer input grad')

        # 3. check layer parameters grad
        weight_grad, bias_grad = custom_layer.getGradParameters()[1]
        torch_weight_grad = torch_layer.weight.grad.data.numpy()
        torch_bias_grad = torch_layer.bias.grad.data.numpy()
        assertAlmostEqual(torch_weight_grad, weight_grad, msg='3. check layer parameters grad. weights')
        assertAlmostEqual(torch_bias_grad, bias_grad, msg='3. check layer parameters grad. bias')
        
test_Sequential()

Iter 0


NameError: name 'BatchNormalization' is not defined

# Layers

## 1. Linear transform layer
Also known as dense layer, fully-connected layer, FC-layer, InnerProductLayer (in caffe), affine transform
- input:   **`batch_size x n_feats1`**
- output: **`batch_size x n_feats2`**

In [6]:
class Linear(Module):
    """
    A module which applies a linear transformation 
    A common name is fully-connected layer, InnerProductLayer in caffe. 
    
    The module should work with 2D input of shape (n_samples, n_feature).
    """
    def __init__(self, n_in, n_out):
        super(Linear, self).__init__()
        self.learning_rate = 0.1
        # This is a nice initialization
        stdv = 1./np.sqrt(n_in)
        self.W = np.random.uniform(-stdv, stdv, size = (n_out,n_in)) 
        print("W shape is {0} in init1\n".format(self.W.shape))
        #was n_out,n_in, change to n_in,n_out -> doesn't change shape for np.dot. WTF?
        self.b = np.random.uniform(-stdv, stdv, size = (n_out, 1))
        self.gradW = np.zeros_like(self.W)
        print("W shape is {0} in init2\n".format(self.W.shape))
        print("Wgrad shape is {0} in init\n".format(self.gradW.shape))
        self.gradb = np.zeros_like(self.b)
        
    def updateOutput(self, input):
        print("W shape is {0} in updateOutput\n".format(self.W.shape))
        self.output = np.dot(input, self.W.T)
        self.output += self.b
        return self.output
    
    def updateGradInput(self, input, gradOutput):
        print("W shape is {0} in updateGrad\n".format(self.W.shape))
        print("Wgrad shape is {0} in upgradeGrad1\n".format(self.gradW.shape))
        self.gradInput = gradOutput @ self.W
        print("Wgrad shape is {0} in upgradeGrad2\n".format(self.gradW.shape))
        return self.gradInput
    
    def accGradParameters(self, input, gradOutput):
        print("W shape is {0} in accGrad1\n".format(self.W.shape))
        print("Wgrad shape is {0} in accGrad0\n".format(self.gradW.shape))
        self.gradW = (input.T @ gradOutput).T
        self.gradb = np.sum(gradOutput,axis=0)
        print("Wgrad shape is {0} in accGrad1\n".format(self.gradW.shape))
        self.W = np.subtract(self.W, np.multiply(self.learning_rate, self.gradW))
        # there it tells me that gradW and W has 3,4 and 4,3 shape. Come on, grad created based on W shape
        print("W shape is {0}in accGrad2\n".format(self.W.shape))
        print("Wgrad shape is {0} in accGrad2\n".format(self.gradW.shape))
        self.b = np.subtract(self.b, np.multiply(self.learning_rate, self.gradb))
        pass
    
    def zeroGradParameters(self):
        self.gradW.fill(0)
        self.gradb.fill(0)
        
    def getParameters(self):
        return [self.W, self.b]
    
    def getGradParameters(self):
        return [self.gradW, self.gradb]
    
    def __repr__(self):
        s = self.W.shape
        q = 'Linear %d -> %d' %(s[1],s[0])
        return q

In [9]:
def test_Linear():
        np.random.seed(42)
        torch.manual_seed(42)

        batch_size, n_in, n_out = 2, 3, 4
        for i in range(100):
            print(f"Iter {i}")
            # layers initialization
            torch_layer = torch.nn.Linear(n_in, n_out)
            custom_layer = Linear(n_in, n_out)
            custom_layer.W = torch_layer.weight.data.numpy()
            custom_layer.b = torch_layer.bias.data.numpy()

            layer_input = np.random.uniform(-10, 10, (batch_size, n_in)).astype(np.float32)
            next_layer_grad = np.random.uniform(-10, 10, (batch_size, n_out)).astype(np.float32)

            # 1. check layer output
            custom_layer_output = custom_layer.updateOutput(layer_input)
            layer_input_var = Variable(torch.from_numpy(layer_input), requires_grad=True)
            torch_layer_output_var = torch_layer(layer_input_var)
            assertAlmostEqual(torch_layer_output_var.data.numpy(), custom_layer_output, msg="1. check layer output")
        
            # 2. check layer input grad
            custom_layer_grad = custom_layer.updateGradInput(layer_input, next_layer_grad)
            torch_layer_output_var.backward(torch.from_numpy(next_layer_grad))
            torch_layer_grad_var = layer_input_var.grad
            assertAlmostEqual(torch_layer_grad_var.data.numpy(), custom_layer_grad, msg='2. check layer input grad')

            # 3. check layer parameters grad
            custom_layer.accGradParameters(layer_input, next_layer_grad)
            weight_grad = custom_layer.gradW
            bias_grad = custom_layer.gradb
            torch_weight_grad = torch_layer.weight.grad.data.numpy()
            torch_bias_grad = torch_layer.bias.grad.data.numpy()
            print(torch_weight_grad)
            print(weight_grad)
            assertAlmostEqual(torch_weight_grad, weight_grad, msg='3. check layer parameters grad. Weight')
            assertAlmostEqual(torch_bias_grad, bias_grad, msg='3. check layer parameters grad. Bias')
            
test_Linear()

Iter 0
W shape is (4, 3) in init1

W shape is (4, 3) in init2

Wgrad shape is (4, 3) in init

W shape is (4, 3) in updateOutput

1. check layer output: OK!
W shape is (4, 3) in updateGrad

Wgrad shape is (4, 3) in upgradeGrad1

Wgrad shape is (4, 3) in upgradeGrad2

2. check layer input grad: OK!
W shape is (4, 3) in accGrad1

Wgrad shape is (4, 3) in accGrad0

Wgrad shape is (4, 3) in accGrad1

W shape is (4, 3)in accGrad2

Wgrad shape is (4, 3) in accGrad2

[[ 41.35547    -15.495246    48.966015  ]
 [  9.275587    -0.68649626   1.5852559 ]
 [ -4.2775316    3.699715   -12.130545  ]
 [ 15.5450115  -17.468357    57.639904  ]]
[[ 41.355465   -15.495246    48.966015  ]
 [  9.275588    -0.68649626   1.5852559 ]
 [ -4.2775316    3.6997147  -12.130545  ]
 [ 15.545012   -17.468357    57.639904  ]]
3. check layer parameters grad. Weight: OK!
3. check layer parameters grad. Bias: OK!
Iter 1
W shape is (4, 3) in init1

W shape is (4, 3) in init2

Wgrad shape is (4, 3) in init

W shape is (4, 3) 

AssertionError: 2. check layer input grad: FAILED:
  expected=[[ 1.0169905e+00 -4.1093044e+00  9.3623996e-01]
 [ 9.9876171e-01 -7.0998330e+00  2.3865700e-03]]\  actual=[[ 1.0169904e+00 -4.1093044e+00  9.3624002e-01]
 [ 9.9876159e-01 -7.0998330e+00  2.3866408e-03]]

## 2. SoftMax
- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

$\text{softmax}(x)_i = \frac{\exp x_i} {\sum_j \exp x_j}$

Recall that $\text{softmax}(x) == \text{softmax}(x - \text{const})$. It makes possible to avoid computing exp() from large argument.

In [7]:
class SoftMax(Module):
    def __init__(self):
         super(SoftMax, self).__init__()
# softmax(𝑥)==softmax(𝑥−const)
    
    def softmax(input):
        exp = np.exp(input - np.max(input, axis=1, keepdims=True))
        return exp / np.sum(exp, axis=1, keepdims=True)
    
    def updateOutput(self, input):
        print(input.shape) # (2,4)
        # start with normalization for numerical stability
        self.output = np.subtract(input, input.max(axis=1, keepdims=True))
        self.output = SoftMax.softmax(self.output)
        # Your code goes here. ################################################
        return self.output
    
    def updateGradInput(self, input, gradOutput):
        # Your code goes here. ################################################
        # ai(1−ai) ; since i=j
        # −aiaj ; since i!=j
        grad = np.zeros_like(self.output)
        #for i in range(self.output.shape[0]):
        #    for j in range(self.output.shape[1]):
        #        sum1 = np.sum(self.output[i], axis=1, keepdims=True)
        #       sum2 = np.sum(self.output[j], axis=1, keepdims=True)
        #        if(i==j):            
        #           grad[i][j] =  sum1 * (1. -  sum1)
        #        else:
        #            grad[i][j] = -sum1 * sum2
        #np.fill_diagonal(grad, self.output[i] * (1. - self.output[i]))
        #grad = -self.output[i] * self.output[j]
        self.gradInput = gradOutput * grad
        return self.gradInput
    
    def __repr__(self):
        return "SoftMax"

In [8]:
def test_SoftMax():
    np.random.seed(42)
    torch.manual_seed(42)

    batch_size, n_in = 2, 4
    for i in range(100):
        print(f"test_SoftMax. Iter {i}")
        # layers initialization
        torch_layer = torch.nn.Softmax(dim=1)
        custom_layer = SoftMax()

        layer_input = np.random.uniform(-10, 10, (batch_size, n_in)).astype(np.float32)
        next_layer_grad = np.random.random((batch_size, n_in)).astype(np.float32)
        next_layer_grad /= next_layer_grad.sum(axis=-1, keepdims=True)
        next_layer_grad = next_layer_grad.clip(1e-5,1.)
        next_layer_grad = 1. / next_layer_grad

        # 1. check layer output
        custom_layer_output = custom_layer.updateOutput(layer_input)
        layer_input_var = Variable(torch.from_numpy(layer_input), requires_grad=True)
        torch_layer_output_var = torch_layer(layer_input_var)
        assertAlmostEqual(torch_layer_output_var.data.numpy(), custom_layer_output, msg='1. check layer output')

        # 2. check layer input grad
        custom_layer_grad = custom_layer.updateGradInput(layer_input, next_layer_grad)
        torch_layer_output_var.backward(torch.from_numpy(next_layer_grad))
        torch_layer_grad_var = layer_input_var.grad
        assertAlmostEqual(torch_layer_grad_var.data.numpy(), custom_layer_grad, msg='2. check layer input grad')
        
test_SoftMax()

test_SoftMax. Iter 0
(2, 4)
1. check layer output: OK!
2. check layer input grad: FAILED:
  expected=[[-7.5124099e-06 -1.3294163e+00  1.3313441e+00 -1.9205434e-03]
 [-4.0679720e-06 -7.1062163e-07  6.3931580e-09  4.7683648e-06]]
  actual=[[0. 0. 0. 0.]
 [0. 0. 0. 0.]]


AssertionError: 2. check layer input grad: FAILED:
  expected=[[-7.5124099e-06 -1.3294163e+00  1.3313441e+00 -1.9205434e-03]
 [-4.0679720e-06 -7.1062163e-07  6.3931580e-09  4.7683648e-06]]\  actual=[[0. 0. 0. 0.]
 [0. 0. 0. 0.]]

## 3. LogSoftMax
- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

$\text{logsoftmax}(x)_i = \log\text{softmax}(x)_i = x_i - \log {\sum_j \exp x_j}$

The main goal of this layer is to be used in computation of log-likelihood loss.

In [None]:
class LogSoftMax(Module):
    def __init__(self):
         super(LogSoftMax, self).__init__()
    
    def updateOutput(self, input):
        # start with normalization for numerical stability
        self.output = np.subtract(input, input.max(axis=1, keepdims=True))
        self.output = np.subtract(self.output,np.log(np.sum(np.exp(self.output),axis=1,keepdim=True)))
        # Your code goes here. ################################################
        return self.output
    
    def updateGradInput(self, input, gradOutput):
        # Your code goes here. ################################################
        return self.gradInput
    
    def __repr__(self):
        return "LogSoftMax"

In [None]:
def test_LogSoftMax():
    np.random.seed(42)
    torch.manual_seed(42)

    batch_size, n_in = 2, 4
    for i in range(100):
        print(f"test_LogSoftMax. Iter {i}")
        # layers initialization
        torch_layer = torch.nn.LogSoftmax(dim=1)
        custom_layer = LogSoftMax()

        layer_input = np.random.uniform(-10, 10, (batch_size, n_in)).astype(np.float32)
        next_layer_grad = np.random.random((batch_size, n_in)).astype(np.float32)
        next_layer_grad /= next_layer_grad.sum(axis=-1, keepdims=True)

        # 1. check layer output
        custom_layer_output = custom_layer.updateOutput(layer_input)
        layer_input_var = Variable(torch.from_numpy(layer_input), requires_grad=True)
        torch_layer_output_var = torch_layer(layer_input_var)
        assertAlmostEqual(torch_layer_output_var.data.numpy(), custom_layer_output, msg='1. check layer output')

        # 2. check layer input grad
        custom_layer_grad = custom_layer.updateGradInput(layer_input, next_layer_grad)
        torch_layer_output_var.backward(torch.from_numpy(next_layer_grad))
        torch_layer_grad_var = layer_input_var.grad
        assertAlmostEqual(torch_layer_grad_var.data.numpy(), custom_layer_grad, msg='2. check layer input grad')
        
test_LogSoftMax()

## 4. Batch normalization
One of the most significant recent ideas that impacted NNs a lot is [**Batch normalization**](http://arxiv.org/abs/1502.03167). The idea is simple, yet effective: the features should be whitened ($mean = 0$, $std = 1$) all the way through NN. This improves the convergence for deep models letting it train them for days but not weeks. **You are** to implement the first part of the layer: features normalization. The second part (`ChannelwiseScaling` layer) is implemented below.

- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

The layer should work as follows. While training (`self.training == True`) it transforms input as $$y = \frac{x - \mu}  {\sqrt{\sigma + \epsilon}}$$
where $\mu$ and $\sigma$ - mean and variance of feature values in **batch** and $\epsilon$ is just a small number for numericall stability. Also during training, layer should maintain exponential moving average values for mean and variance: 
```
    self.moving_mean = self.moving_mean * alpha + batch_mean * (1 - alpha)
    self.moving_variance = self.moving_variance * alpha + batch_variance * (1 - alpha)
```
During testing (`self.training == False`) the layer normalizes input using moving_mean and moving_variance. 

Note that decomposition of batch normalization on normalization itself and channelwise scaling here is just a common **implementation** choice. In general "batch normalization" always assumes normalization + scaling.

In [30]:
class BatchNormalization(Module):
    EPS = 1e-3
    def __init__(self, alpha = 0.):
        super(BatchNormalization, self).__init__()
        self.alpha = alpha
        self.moving_mean = None 
        self.moving_variance = None
        
    def updateOutput(self, input):
        # Your code goes here. ################################################
        # use self.EPS please
        self.output = input
        #print(dir(input))
        if (self.training):
            batch_mean = input.mean()
            batch_variance = input.var() #input.var()
            self.moving_mean = self.moving_mean * self.alpha + batch_mean * (1 - self.alpha)
            self.moving_variance = self.moving_variance * self.alpha + batch_variance * (1 - self.alpha)
            
            self.output = np.subtract(input,self.moving_mean) / np.sqrt(self.moving_mean + self.EPS)
        #else:
            
        return self.output
    
    def updateGradInput(self, input, gradOutput):
        #if (self.training):
        
        #else:
        self.gradInput = gradOutput    
        # Your code goes here. ################################################
        return self.gradInput
    
    def __repr__(self):
        return "BatchNormalization"

In [31]:
def test_BatchNormalization():
    np.random.seed(42)
    torch.manual_seed(42)

    batch_size, n_in = 32, 16
    for _ in range(100):
        # layers initialization
        slope = np.random.uniform(0.01, 0.05)
        alpha = 0.9
        custom_layer = BatchNormalization(alpha)
        custom_layer.train()
        torch_layer = torch.nn.BatchNorm1d(n_in, eps=custom_layer.EPS, momentum=1.-alpha, affine=False)
        custom_layer.moving_mean = torch_layer.running_mean.numpy().copy()
        custom_layer.moving_variance = torch_layer.running_var.numpy().copy()

        layer_input = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)
        next_layer_grad = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)

        # 1. check layer output
        custom_layer_output = custom_layer.updateOutput(layer_input)
        layer_input_var = Variable(torch.from_numpy(layer_input), requires_grad=True)
        torch_layer_output_var = torch_layer(layer_input_var)
        assertAlmostEqual(torch_layer_output_var.data.numpy(), custom_layer_output, atol=1e-6)

        # 2. check layer input grad
        custom_layer_grad = custom_layer.updateGradInput(layer_input, next_layer_grad)
        torch_layer_output_var.backward(torch.from_numpy(next_layer_grad))
        torch_layer_grad_var = layer_input_var.grad
        # please, don't increase `atol` parameter, it's garanteed that you can implement batch norm layer
        # with tolerance 1e-5
        assertAlmostEqual(torch_layer_grad_var.data.numpy(), custom_layer_grad, atol=1e-5)

        # 3. check moving mean
        assertAlmostEqual(custom_layer.moving_mean, torch_layer.running_mean.numpy())
        # we don't check moving_variance because pytorch uses slightly different formula for it:
        # it computes moving average for unbiased variance (i.e var*N/(N-1))
        #self.assertTrue(np.allclose(custom_layer.moving_variance, torch_layer.running_var.numpy()))

        # 4. check evaluation mode
        custom_layer.moving_variance = torch_layer.running_var.numpy().copy()
        custom_layer.evaluate()
        custom_layer_output = custom_layer.updateOutput(layer_input)
        torch_layer.eval()
        torch_layer_output_var = torch_layer(layer_input_var)
        assertAlmostEqual(torch_layer_output_var.data.numpy(), custom_layer_output, atol=1e-6)
            
test_BatchNormalization()

???: FAILED:
  expected=[[ 1.41016996e+00  7.32519150e-01  4.50298339e-01 -9.46392715e-01
  -1.47206545e+00 -1.37083197e+00  1.37724662e+00  3.88500780e-01
   5.41669905e-01 -1.58092630e+00  1.23230290e+00  1.08739460e+00
  -6.12358093e-01 -1.12429333e+00 -1.24007118e+00 -7.80916631e-01]
 [ 2.95836106e-03 -2.94019282e-01 -6.19553506e-01  4.92805600e-01
  -1.52739215e+00 -6.47969484e-01 -5.03135085e-01 -4.42811698e-02
   8.19093764e-01 -9.12917614e-01 -2.08963335e-01  2.73858875e-01
  -1.26506019e+00  3.70364130e-01 -1.28933299e+00 -1.57288957e+00]
 [ 1.40412831e+00  1.53185034e+00  1.18018794e+00 -4.77235675e-01
  -1.66762042e+00  5.62936842e-01 -2.25522816e-01 -1.04095721e+00
  -2.24344641e-01 -1.52943671e+00  1.04066312e+00 -8.56940687e-01
   1.15892267e+00 -6.68276966e-01  4.75193001e-02  2.19079480e-02]
 [-1.11995554e+00  1.54537308e+00  1.06442773e+00  1.52727723e+00
   1.00523961e+00  2.96310157e-01  1.58679175e+00 -1.14105034e+00
  -1.30086732e+00 -1.48900783e+00 -8.06452513e-01

AssertionError: ???: FAILED:
  expected=[[ 1.41016996e+00  7.32519150e-01  4.50298339e-01 -9.46392715e-01
  -1.47206545e+00 -1.37083197e+00  1.37724662e+00  3.88500780e-01
   5.41669905e-01 -1.58092630e+00  1.23230290e+00  1.08739460e+00
  -6.12358093e-01 -1.12429333e+00 -1.24007118e+00 -7.80916631e-01]
 [ 2.95836106e-03 -2.94019282e-01 -6.19553506e-01  4.92805600e-01
  -1.52739215e+00 -6.47969484e-01 -5.03135085e-01 -4.42811698e-02
   8.19093764e-01 -9.12917614e-01 -2.08963335e-01  2.73858875e-01
  -1.26506019e+00  3.70364130e-01 -1.28933299e+00 -1.57288957e+00]
 [ 1.40412831e+00  1.53185034e+00  1.18018794e+00 -4.77235675e-01
  -1.66762042e+00  5.62936842e-01 -2.25522816e-01 -1.04095721e+00
  -2.24344641e-01 -1.52943671e+00  1.04066312e+00 -8.56940687e-01
   1.15892267e+00 -6.68276966e-01  4.75193001e-02  2.19079480e-02]
 [-1.11995554e+00  1.54537308e+00  1.06442773e+00  1.52727723e+00
   1.00523961e+00  2.96310157e-01  1.58679175e+00 -1.14105034e+00
  -1.30086732e+00 -1.48900783e+00 -8.06452513e-01 -4.16675121e-01
  -3.80178988e-01  1.14694810e+00 -5.77087879e-01 -8.58089745e-01]
 [ 6.22245148e-02 -1.28967035e+00  1.15861082e+00 -1.20361042e+00
   1.31391525e+00  8.34747970e-01 -1.13384676e+00 -1.38861549e+00
   9.28063095e-01  9.78892982e-01  4.70346242e-01  8.80061150e-01
  -1.15648842e+00 -5.04126310e-01 -1.49836469e+00  1.06950259e+00]
 [ 3.28504682e-01 -6.39724970e-01 -1.41184616e+00 -4.57128316e-01
  -9.04775739e-01  7.03065097e-01  5.17146409e-01  1.24215245e+00
  -3.06963593e-01 -1.21161675e+00  4.20491040e-01  8.44523013e-01
   7.60565996e-01  9.44122612e-01 -5.29612228e-02 -5.74826635e-02]
 [-3.18206370e-01 -1.68484068e+00 -1.25756729e+00 -1.33975708e+00
   1.38768137e-01 -5.79373121e-01  3.18774097e-02  1.30288339e+00
  -1.10905623e+00 -1.26965895e-01  5.54302752e-01 -9.58559096e-01
  -1.14493942e+00 -7.45374858e-01 -1.32491231e+00  1.28999960e+00]
 [ 9.39090848e-01  3.95218790e-01  1.39964759e+00  1.09843326e+00
  -1.36954582e+00  1.20632052e+00  1.47644922e-01  1.00412893e+00
   1.21817541e+00 -4.71544147e-01 -1.48736143e+00 -9.61484075e-01
   2.32666001e-01  1.10930169e+00  1.35040438e+00 -1.76525998e+00]
 [-4.33227643e-02 -3.43743563e-01 -8.60095143e-01 -1.06053889e+00
  -8.63091767e-01  1.36182117e+00 -6.65505886e-01  1.42863214e-01
   5.23486614e-01 -3.01357239e-01  1.23822451e+00  1.52802396e+00
  -4.57165807e-01 -1.68746710e-02 -7.90785372e-01 -8.45156848e-01]
 [-1.60878682e+00  3.13658476e-01  1.16290532e-01 -1.27645493e+00
  -1.06081378e+00  1.25482881e+00 -9.80176806e-01 -9.72758174e-01
  -2.44940564e-01  2.01879978e+00 -1.06984508e+00  5.44060051e-01
   1.54882884e+00 -9.28341269e-01  8.43595505e-01 -5.70529401e-01]
 [ 3.58262867e-01  3.95649731e-01  2.31463164e-01 -1.15391743e+00
   8.05652916e-01 -5.59533298e-01 -1.17973459e+00 -1.28342819e+00
   1.20048836e-01  8.69629264e-01 -1.78298116e+00  1.62234041e-03
  -5.56657612e-01  5.02471805e-01 -1.27463794e+00  4.99452710e-01]
 [-4.53013629e-01  1.43296969e+00 -1.15445685e+00 -3.62144530e-01
  -1.61463833e+00  1.30556357e+00  1.41924441e+00 -6.35451913e-01
   3.68643790e-01  1.39055729e+00 -7.93898702e-02  6.11306690e-02
  -4.96236235e-01 -1.43578756e+00  1.48994422e+00  1.19305325e+00]
 [ 3.60891342e-01 -6.11904263e-01 -4.17782009e-01  8.53060544e-01
   1.01289427e+00  1.18941927e+00  1.05256987e+00  5.10586858e-01
  -1.70328653e+00 -1.05482686e+00  1.00661039e+00  3.21358711e-01
  -1.41163647e+00 -1.40640569e+00  5.96090555e-01 -1.77151966e+00]
 [-1.19939613e+00  1.05542779e-01  7.74761438e-01  6.19439125e-01
  -1.24314022e+00  6.49244547e-01 -9.88877952e-01 -4.34172124e-01
   6.79903805e-01  7.65443981e-01  8.50580871e-01  4.94837910e-01
   7.88231611e-01 -1.43377936e+00 -5.35161257e-01 -9.10179675e-01]
 [-9.24594164e-01  1.55709398e+00 -2.65051574e-01  1.37745655e+00
   1.21091820e-01  9.04441357e-01  9.55426414e-03  3.16260219e-01
  -2.33912706e-01 -9.29444611e-01  4.49613094e-01 -7.82401145e-01
  -1.35214984e+00  5.03523350e-01 -1.26414239e+00  1.32562959e+00]
 [ 1.42078876e+00  1.35816228e+00 -3.44879210e-01 -1.39018703e+00
   1.11753559e+00 -2.27831841e-01  1.75526357e+00  1.47013462e+00
   1.06316376e+00 -5.59403419e-01 -6.17412746e-01  1.15075493e+00
  -2.00868532e-01 -1.16759050e+00  1.88007846e-01  1.31137955e+00]
 [ 5.68783939e-01  1.78508818e-01 -1.29485512e+00  5.02764702e-01
   1.32453394e+00 -1.11758614e+00  6.85921833e-02  1.21279311e+00
   6.59312785e-01  9.42183495e-01  3.86455745e-01 -5.15596747e-01
  -2.92662948e-01  1.07891977e+00  1.15681589e+00  1.08264375e+00]
 [ 1.28637016e+00 -2.23820228e-02  1.12244241e-01  1.08145690e+00
   1.84213027e-01  6.17705226e-01  1.11245263e+00  1.25048494e+00
  -7.89896369e-01 -2.56770849e-01 -1.53818953e+00  2.25952566e-01
  -1.30640531e+00 -1.27996191e-01  1.33864954e-01 -8.39525521e-01]
 [ 2.21252427e-01 -1.66745687e+00 -1.50305724e+00  1.15819585e+00
  -7.87396252e-01 -1.15780723e+00  8.33158121e-02  8.92396629e-01
  -1.22948813e+00  6.65693939e-01 -1.56549966e+00 -1.55886626e+00
   6.42833352e-01  1.35451153e-01  4.96377140e-01  6.15848184e-01]
 [ 1.49321604e+00 -5.41970087e-03 -5.09142578e-01  1.07164097e+00
  -1.08701479e+00 -1.94516957e-01 -1.58628154e+00 -1.32945120e+00
   1.45765293e+00  1.46052504e+00  3.65865499e-01 -3.47954124e-01
  -7.65982866e-01 -1.21342766e+00 -9.84443605e-01  3.02398372e-02]
 [ 6.30119741e-01  4.86886114e-01 -6.58860803e-01  1.57579315e+00
   4.79052186e-01  1.61825076e-01  4.19944465e-01 -1.53099254e-01
  -1.11467373e+00 -3.29917789e-01  5.61561584e-01 -1.68524861e+00
  -9.91126060e-01 -1.60115123e+00 -1.78574336e+00  1.04419661e+00]
 [ 5.93984365e-01 -1.49544507e-01 -1.29256654e+00  1.13183148e-01
  -4.07564849e-01 -1.01530647e+00 -2.49227628e-01 -2.16043025e-01
   2.09846586e-01  7.11212158e-01 -1.69215405e+00 -4.64345008e-01
   1.01467180e+00  3.79660563e-03  1.33418548e+00  3.92690778e-01]
 [-1.19237137e+00 -1.53037310e+00  6.02585554e-01 -1.35528433e+00
  -3.10103670e-02  1.35354602e+00  2.83579081e-01 -2.46879801e-01
   3.08570981e-01  5.15905917e-02 -1.09703377e-01  1.45690715e+00
   7.13280514e-02  1.61197674e+00  1.52105665e+00 -1.14000404e+00]
 [-1.50150323e+00 -1.42702007e+00 -1.56961691e+00 -1.14080465e+00
   2.95005649e-01 -1.33035910e+00 -6.81409657e-01  1.11582708e+00
  -1.92229426e+00  1.38028586e+00 -9.43962336e-01 -1.33353257e+00
   1.29354393e+00  4.45490211e-01  1.41443312e+00  6.45580411e-01]
 [ 9.23763812e-01 -8.06898475e-01 -1.01554036e+00  9.30916309e-01
   7.10200608e-01  1.50881255e+00 -3.29113245e-01 -2.95073271e-01
   7.87563801e-01 -3.86499196e-01  1.10846639e+00  1.17541599e+00
   2.40087569e-01  8.73567283e-01  9.44282949e-01 -1.44683027e+00]
 [ 1.25106192e+00 -4.32174429e-02  1.24303710e+00 -4.28500354e-01
   1.00757301e+00 -3.48223358e-01 -1.84067416e+00  1.29636538e+00
  -1.67757213e+00 -4.66657221e-01  1.16952550e+00  1.48789370e+00
   8.08413208e-01  4.55652028e-01 -2.26405397e-01 -8.17442417e-01]
 [-6.44858658e-01  5.29039443e-01  9.85229015e-01  1.06025207e+00
   6.52473330e-01 -1.26853812e+00 -2.13586148e-02 -1.23334968e+00
  -2.87821610e-02 -1.07844472e-02  9.72292602e-01 -5.44664145e-01
  -9.87213492e-01 -1.26063287e+00  9.70931590e-01  2.58674026e-01]
 [-1.39657509e+00 -1.48405612e+00  8.06338608e-01 -1.20925438e+00
   7.60580420e-01  6.30909026e-01 -1.57539988e+00 -1.15195525e+00
   1.54397500e+00 -2.61665255e-01 -6.63134634e-01  1.02081764e+00
   2.27920032e+00  1.69908381e+00  9.39828515e-01 -5.42463362e-01]
 [-1.45479167e+00  8.86998117e-01  3.10213894e-01 -9.95984375e-02
   1.04388976e+00 -1.20679784e+00 -2.81124450e-02 -1.37121546e+00
  -3.19752187e-01 -1.44769406e+00 -1.45963526e+00 -1.33569694e+00
   1.10654569e+00  8.56623054e-01  2.89616942e-01  1.39752567e+00]
 [-4.92210567e-01 -7.94316828e-01  1.38968980e+00 -7.33032286e-01
   1.23456860e+00 -1.51267731e+00  1.76739264e+00 -1.27631259e+00
   1.20037150e+00  3.10634613e-01  1.30522394e+00 -1.48391163e+00
   7.31360018e-01  1.64045715e+00  5.91070466e-02  2.95693517e-01]
 [ 5.67855239e-01 -2.16712832e-01  5.50868690e-01  4.05858457e-01
   1.02646625e+00 -1.40986013e+00 -8.24418545e-01  1.43072331e+00
   1.19720757e+00  4.19069082e-02  1.25984386e-01 -7.93894947e-01
  -7.07645535e-01 -1.34665519e-01 -5.90095639e-01  1.44237578e-01]
 [-1.47384083e+00  1.56182992e+00  1.79897726e+00  7.65307069e-01
  -1.97584689e-01 -5.94284832e-01  1.18018043e+00  6.37992740e-01
  -1.42092049e+00  1.74007988e+00  7.66174614e-01  1.48515773e+00
   1.40757704e+00  3.90974283e-01 -3.41916680e-01  1.30003476e+00]]\  actual=[[ 72.12897     37.10405     15.7523     -55.13009    -55.133953
  -70.81297     58.5914      16.145678    33.273388   -76.81792
   75.20286     53.189465   -46.11118    -50.997578   -50.744637
  -31.394217  ]
 [  3.9179523  -10.944458   -33.47808     17.865194   -57.776295
  -33.33147    -21.446657    -7.0811925   45.620388   -48.13935
    2.2330067   14.752427   -72.67586     17.175323   -52.807247
  -69.69715   ]
 [ 71.83611     74.51782     49.338955   -31.334723   -64.47344
   29.45583     -9.630149   -60.57156     -0.81878185 -74.6074
   65.50033    -38.674343    25.979166   -30.19819      3.1671708
    7.433544  ]
 [-50.512447    75.15077     44.012123    70.332985    63.179474
   15.6308365   67.51064    -65.94343    -48.730396   -72.87173
  -28.017263   -17.873161   -36.66159     52.596138   -22.985323
  -35.126614  ]
 [  6.7907286  -57.547245    48.346058   -68.17602     77.921486
   43.54964    -48.292774   -79.22994     50.470165    33.07879
   36.625763    43.3936     -68.25704    -22.711107   -61.559464
   58.099346  ]
 [ 19.697985   -27.125677   -69.93627    -30.31489    -28.040804
   36.721684    21.981413    61.960106    -4.495813   -60.962933
   34.101646    41.714535     9.766227    43.345055    -1.0399792
    3.5939016 ]
 [-11.649695   -76.04372    -62.83697    -75.08129     21.797728
  -29.774647     1.3260391   65.21946    -40.19367    -14.397317
   40.8764     -43.4755     -67.787      -33.714706   -54.29696
   68.76345   ]
 [ 49.2946      21.316254    59.437622    48.58227    -50.23772
   62.81623      6.2536592   49.185677    63.381878   -29.190554
  -62.490997   -43.613693   -11.71907     50.879047    57.719368
  -79.00094   ]
 [  1.674592   -13.271871   -44.54686    -60.919514   -26.050024
   70.87917    -28.357935     2.9626138   32.464127   -21.884188
   75.50266     74.007835   -39.79492     -0.48704565 -31.932901
  -34.50113   ]
 [-74.20731     17.498714     0.38257343 -71.87064    -35.49301
   65.33146    -41.751842   -56.911404    -1.735422    77.72348
  -41.352562    27.518593    41.848175   -42.059998    36.499123
  -21.219059  ]
 [ 21.140436    21.336424     5.6823664  -65.65562     53.647427
  -28.745922   -50.245975   -73.58467     14.50876     28.38794
  -77.457924     1.8900954  -43.844193    23.200895   -52.191956
   30.529491  ]
 [-18.184135    69.88958    -58.09223    -25.497366   -61.943077
   67.96213     60.379025   -38.808594    25.572704    50.75213
    8.79318      4.7016788  -41.38507    -65.205154    63.561943
   64.074745  ]
 [ 21.267845   -25.823492   -24.193348    36.13711     63.54505
   61.939877    44.7716      22.697887   -66.64043    -54.231716
   63.77627     16.996647   -78.64145    -63.86502     26.136017
  -79.30369   ]
 [-54.363132     7.7575803   30.682816    24.287971   -44.200726
   33.93101    -42.122204   -28.006155    39.425613    23.915117
   55.87666     25.192999    10.892204   -65.11356    -21.229841
  -37.64589   ]
 [-41.042805    75.69939    -17.165293    62.73417     20.953526
   47.163345     0.37585893  12.268615    -1.244617   -48.848877
   35.576065   -35.15258    -76.220375    23.248857   -51.752506
   70.48666   ]
 [ 72.643684    66.38812    -20.838646   -77.63907     68.54261
  -11.546707    74.68161     74.195625    56.482944   -32.96248
  -18.446375    56.18304    -29.363735   -52.972404     9.049469
   69.79747   ]
 [ 31.344912    11.1728525  -64.55281     18.370316    78.42863
  -57.681793     2.888796    60.384426    38.50919     31.502796
   32.378475   -22.546902   -33.099728    49.493294    49.613758
   58.734905  ]
 [ 66.12809      1.7698882    0.19637921  47.72124     23.968128
   32.29565     47.3205      62.4073     -25.989174   -19.97003
  -65.06438     12.489002   -74.35859     -5.555415     6.78249
  -34.228775  ]
 [ 14.499204   -75.23005    -74.13345     51.613396   -22.434887
  -59.76732      3.515504    43.18914    -45.5536      19.632704
  -66.44706    -71.83814      4.974559     6.4606953   21.960985
   36.158836  ]
 [ 76.15441      2.5638325  -28.397402    47.223377   -36.744347
   -9.819282   -67.55057    -76.054665    74.04004     53.75594
   31.33601    -14.626307   -52.363636   -55.063087   -40.04143
    7.8365073 ]
 [ 34.31801     25.606867   -35.286846    72.79368     38.049328
    8.657584    17.844032   -12.921325   -40.44368    -23.110332
   41.243904   -77.80932    -61.526867   -72.74757    -73.592094
   56.875454  ]
 [ 32.566437    -4.1821227  -64.447495    -1.3890303   -4.2945476
  -52.378445   -10.639139   -16.299437    18.505291    21.586868
  -72.85944    -20.125416    20.108217     0.45579308  57.040276
   25.366064  ]
 [-54.022625   -68.81366     22.75996    -75.86883     13.68929
   70.45009     12.039668   -17.954409    22.899109    -6.7316313
    7.2584376   70.64779    -18.285454    73.80659     64.864624
  -48.7611    ]
 [-69.00701    -63.97608    -77.196266   -64.99055     29.259462
  -68.71439    -29.034878    55.180374   -76.38756     50.31116
  -34.979248   -61.191833    31.458187    20.601904    60.40027
   37.596806  ]
 [ 48.551662   -34.950455   -51.69984     40.085907    49.088726
   78.50089    -14.039453   -20.540892    44.217117   -25.539452
   68.93314     57.3482     -11.417016    40.12695     40.71494
  -63.60042   ]
 [ 64.41661      0.7946582   52.231026   -28.862896    63.290913
  -17.789188   -78.37873     64.869644   -65.49599    -28.980751
   72.024506    72.11181     11.713585    21.065395    -8.30214
  -33.160748  ]
 [-27.483337    27.57991     40.36771     46.645744    46.331734
  -65.50888     -0.93994135 -70.89702      7.884903    -9.409481
   62.038803   -23.920244   -61.36763    -57.216167    41.830723
   18.884485  ]
 [-63.920887   -66.64573     32.135876   -68.46228     51.494812
   32.980286   -67.08739    -66.52869     77.88188    -20.180153
  -20.761229    50.043907    71.573975    77.77963     40.528427
  -19.861677  ]
 [-66.74279     44.33465      9.306165   -12.181187    65.02536
  -62.307552    -1.2274168  -78.296104    -5.0649805  -71.09807
  -61.08725    -61.29409     23.847445    39.354107    13.303872
   73.96384   ]
 [-20.084106   -34.361557    58.979404   -44.308575    74.13198
  -78.16786     75.19788    -73.20279     62.589497     4.3894997
   78.89477    -68.29677      8.577554    75.105606     3.6523533
   20.674894  ]
 [ 31.299896    -7.326027    20.380154    13.455288    64.19324
  -72.83664    -35.12202     72.08047     62.448685    -7.1473656
   19.19107    -35.69563    -49.989334    -5.8596106  -23.52996
   13.349887  ]
 [-67.666145    75.92105     77.8132      31.686308     5.733879
  -30.54784     50.203316    29.5356     -54.073467    65.75764
   51.603256    71.98254     36.09929     18.115376   -13.138633
   69.248795  ]]

In [None]:
class ChannelwiseScaling(Module):
    """
       Implements linear transform of input y = \gamma * x + \beta
       where \gamma, \beta - learnable vectors of length x.shape[-1]
    """
    def __init__(self, n_out):
        super(ChannelwiseScaling, self).__init__()

        stdv = 1./np.sqrt(n_out)
        self.gamma = np.random.uniform(-stdv, stdv, size=n_out)
        self.beta = np.random.uniform(-stdv, stdv, size=n_out)
        
        self.gradGamma = np.zeros_like(self.gamma)
        self.gradBeta = np.zeros_like(self.beta)

    def updateOutput(self, input):
        self.output = input * self.gamma + self.beta
        return self.output
        
    def updateGradInput(self, input, gradOutput):
        self.gradInput = gradOutput * self.gamma
        return self.gradInput
    
    def accGradParameters(self, input, gradOutput):
        self.gradBeta = np.sum(gradOutput, axis=0)
        self.gradGamma = np.sum(gradOutput*input, axis=0)
    
    def zeroGradParameters(self):
        self.gradGamma.fill(0)
        self.gradBeta.fill(0)
        
    def getParameters(self):
        return [self.gamma, self.beta]
    
    def getGradParameters(self):
        return [self.gradGamma, self.gradBeta]
    
    def __repr__(self):
        return "ChannelwiseScaling"

Practical notes. If BatchNormalization is placed after a linear transformation layer (including dense layer, convolutions, channelwise scaling) that implements function like `y = weight * x + bias`, than bias adding become useless and could be omitted since its effect will be discarded while batch mean subtraction. If BatchNormalization (followed by `ChannelwiseScaling`) is placed before a layer that propagates scale (including ReLU, LeakyReLU) followed by any linear transformation layer than parameter `gamma` in `ChannelwiseScaling` could be freezed since it could be absorbed into the linear transformation layer.

## 5. Dropout
Implement [**dropout**](https://www.cs.toronto.edu/~hinton/absps/JMLRdropout.pdf). The idea and implementation is really simple: just multimply the input by $Bernoulli(p)$ mask. Here $p$ is probability of an element to be zeroed.

This has proven to be an effective technique for regularization and preventing the co-adaptation of neurons.

While training (`self.training == True`) it should sample a mask on each iteration (for every batch), zero out elements and multiply elements by $1 / (1 - p)$. The latter is needed for keeping mean values of features close to mean values which will be in test mode. When testing this module should implement identity transform i.e. `self.output = input`.

- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

In [None]:
class Dropout(Module):
    def __init__(self, p=0.5):
        super(Dropout, self).__init__()
        
        self.p = p
        self.mask = None
        
    def updateOutput(self, input):
        # Your code goes here. ################################################
        
        if(self.training):
            #
        else:
            self.output = input
        return  self.output
    
    def updateGradInput(self, input, gradOutput):
        # Your code goes here. ################################################
        if(self.training):
            #
        else:
            self.gradInput = 1 * gradOutput
        return self.gradInput
        
    def __repr__(self):
        return "Dropout"

In [None]:
def test_Dropout():
    np.random.seed(42)

    batch_size, n_in = 2, 4
    for _ in range(100):
        # layers initialization
        p = np.random.uniform(0.3, 0.7)
        layer = Dropout(p)
        layer.train()

        layer_input = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)
        next_layer_grad = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)

        # 1. check layer output
        layer_output = layer.updateOutput(layer_input)
        assertTrue(np.all(np.logical_or(np.isclose(layer_output, 0), 
                                    np.isclose(layer_output*(1.-p), layer_input))))

        # 2. check layer input grad
        layer_grad = layer.updateGradInput(layer_input, next_layer_grad)
        assertTrue(np.all(np.logical_or(np.isclose(layer_grad, 0), 
                                    np.isclose(layer_grad*(1.-p), next_layer_grad))))

        # 3. check evaluation mode
        layer.evaluate()
        layer_output = layer.updateOutput(layer_input)
        assertAlmostEqual(layer_output, layer_input)

        # 4. check mask
        p = 0.0
        layer = Dropout(p)
        layer.train()
        layer_output = layer.updateOutput(layer_input)
        assertAlmostEqual(layer_output, layer_input)

        p = 0.5
        layer = Dropout(p)
        layer.train()
        layer_input = np.random.uniform(5, 10, (batch_size, n_in)).astype(np.float32)
        next_layer_grad = np.random.uniform(5, 10, (batch_size, n_in)).astype(np.float32)
        layer_output = layer.updateOutput(layer_input)
        zeroed_elem_mask = np.isclose(layer_output, 0)
        layer_grad = layer.updateGradInput(layer_input, next_layer_grad)        
        assertTrue(np.all(zeroed_elem_mask == np.isclose(layer_grad, 0)))

        # 5. dropout mask should be generated independently for every input matrix element, not for row/column
        batch_size, n_in = 1000, 1
        p = 0.8
        layer = Dropout(p)
        layer.train()

        layer_input = np.random.uniform(5, 10, (batch_size, n_in)).astype(np.float32)
        layer_output = layer.updateOutput(layer_input)
        assertTrue(np.sum(np.isclose(layer_output, 0)) != layer_input.size)

        layer_input = layer_input.T
        layer_output = layer.updateOutput(layer_input)
        assertTrue(np.sum(np.isclose(layer_output, 0)) != layer_input.size)
        
# test_Dropout()

# Activation functions

Here's the complete example for the **Rectified Linear Unit** non-linearity (aka **ReLU**): 

In [None]:
class ReLU(Module):
    def __init__(self):
         super(ReLU, self).__init__()
    
    def updateOutput(self, input):
        self.output = np.maximum(input, 0)
        return self.output
    
    def updateGradInput(self, input, gradOutput):
        self.gradInput = np.multiply(gradOutput , input > 0)
        return self.gradInput
    
    def __repr__(self):
        return "ReLU"

## 6. Leaky ReLU
Implement [**Leaky Rectified Linear Unit**](http://en.wikipedia.org/wiki%2FRectifier_%28neural_networks%29%23Leaky_ReLUs). Expriment with slope. 

In [None]:
class LeakyReLU(Module):
    def __init__(self, slope = 0.03):
        super(LeakyReLU, self).__init__()
            
        self.slope = slope
        
    def updateOutput(self, input):
        # Your code goes here. ################################################
        output = input
        if (input <= 0):
            output = np.multiply(self.slope, output)
        return  self.output
    
    def updateGradInput(self, input, gradOutput):
        # Your code goes here. ################################################
        relu_grad = self.slope
        if (input > 0):
            relu_grad = 1
        self.gradInput = np.multiply(grad_output,relu_grad) 
        return self.gradInput
    
    def __repr__(self):
        return "LeakyReLU"
    

In [None]:
def test_LeakyReLU():
    np.random.seed(42)
    torch.manual_seed(42)

    batch_size, n_in = 2, 4
    for _ in range(100):
        # layers initialization
        slope = np.random.uniform(0.01, 0.05)
        torch_layer = torch.nn.LeakyReLU(slope)
        custom_layer = LeakyReLU(slope)

        layer_input = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)
        next_layer_grad = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)

        # 1. check layer output
        custom_layer_output = custom_layer.updateOutput(layer_input)
        layer_input_var = Variable(torch.from_numpy(layer_input), requires_grad=True)
        torch_layer_output_var = torch_layer(layer_input_var)
        assertAlmostEqual(torch_layer_output_var.data.numpy(), custom_layer_output, atol=1e-6)

        # 2. check layer input grad
        custom_layer_grad = custom_layer.updateGradInput(layer_input, next_layer_grad)
        torch_layer_output_var.backward(torch.from_numpy(next_layer_grad))
        torch_layer_grad_var = layer_input_var.grad
        assertAlmostEqual(torch_layer_grad_var.data.numpy(), custom_layer_grad, atol=1e-6)
        
# test_LeakyReLU()

## 7. ELU
Implement [**Exponential Linear Units**](http://arxiv.org/abs/1511.07289) activations.

In [None]:
class ELU(Module):
    def __init__(self, alpha = 1.0):
        super(ELU, self).__init__()
        
        self.alpha = alpha
        
    def updateOutput(self, input):
        self.output = input
        if (input <= 0):
            self.output = np.multiply(self.alpha, np.expm1(input))
        # Your code goes here. ################################################
        return  self.output
    
    def updateGradInput(self, input, gradOutput):
        relu_grad = 1
        if (input <= 0):
            relu_grad = np.multiply(self.alpha, np.expm1(input))
        self.gradInput = np.multiply(grad_output,relu_grad) 
        # Your code goes here. ################################################
        return self.gradInput
    
    def __repr__(self):
        return "ELU"

In [None]:
def test_ELU():
    np.random.seed(42)
    torch.manual_seed(42)

    batch_size, n_in = 2, 4
    for _ in range(100):
        # layers initialization
        alpha = 1.0
        torch_layer = torch.nn.ELU(alpha)
        custom_layer = ELU(alpha)

        layer_input = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)
        next_layer_grad = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)

        # 1. check layer output
        custom_layer_output = custom_layer.updateOutput(layer_input)
        layer_input_var = Variable(torch.from_numpy(layer_input), requires_grad=True)
        torch_layer_output_var = torch_layer(layer_input_var)
        assertAlmostEqual(torch_layer_output_var.data.numpy(), custom_layer_output, atol=1e-6)

        # 2. check layer input grad
        custom_layer_grad = custom_layer.updateGradInput(layer_input, next_layer_grad)
        torch_layer_output_var.backward(torch.from_numpy(next_layer_grad))
        torch_layer_grad_var = layer_input_var.grad
        assertAlmostEqual(torch_layer_grad_var.data.numpy(), custom_layer_grad, atol=1e-6)
        
# test_ELU()

## 8. SoftPlus
Implement [**SoftPlus**](https://en.wikipedia.org/wiki%2FRectifier_%28neural_networks%29) activations. Look, how they look a lot like ReLU.

In [None]:
class SoftPlus(Module):
    def __init__(self):
        super(SoftPlus, self).__init__()
    
    def updateOutput(self, input):
        # ln(1 + e^x)
        self.output = np.log1p(numpy.exp(input))   
        return  self.output
    
    def updateGradInput(self, input, gradOutput):
        #1 / 1 + e^-x
        relu_grad = np.divide(1, 1 + np.exp(-input))
        self.gradInput = gradOutput * relu_grad 
        return self.gradInput
    
    def __repr__(self):
        return "SoftPlus"

In [None]:
def test_SoftPlus():
    np.random.seed(42)
    torch.manual_seed(42)

    batch_size, n_in = 2, 4
    for _ in range(100):
        # layers initialization
        torch_layer = torch.nn.Softplus()
        custom_layer = SoftPlus()

        layer_input = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)
        next_layer_grad = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)

        # 1. check layer output
        custom_layer_output = custom_layer.updateOutput(layer_input)
        layer_input_var = Variable(torch.from_numpy(layer_input), requires_grad=True)
        torch_layer_output_var = torch_layer(layer_input_var)
        assertAlmostEqual(torch_layer_output_var.data.numpy(), custom_layer_output, atol=1e-6)

        # 2. check layer input grad
        custom_layer_grad = custom_layer.updateGradInput(layer_input, next_layer_grad)
        torch_layer_output_var.backward(torch.from_numpy(next_layer_grad))
        torch_layer_grad_var = layer_input_var.grad
        assertAlmostEqual(np.allclose(torch_layer_grad_var.data.numpy(), custom_layer_grad, atol=1e-6))
        
# test_SoftPlus()

# Criterions

Criterions are used to score the models answers. 

In [None]:
class Criterion(object):
    def __init__ (self):
        self.output = None
        self.gradInput = None
        
    def forward(self, input, target):
        """
            Given an input and a target, compute the loss function 
            associated to the criterion and return the result.
            
            For consistency this function should not be overrided,
            all the code goes in `updateOutput`.
        """
        return self.updateOutput(input, target)

    def backward(self, input, target):
        """
            Given an input and a target, compute the gradients of the loss function
            associated to the criterion and return the result. 

            For consistency this function should not be overrided,
            all the code goes in `updateGradInput`.
        """
        return self.updateGradInput(input, target)
    
    def updateOutput(self, input, target):
        """
        Function to override.
        """
        return self.output

    def updateGradInput(self, input, target):
        """
        Function to override.
        """
        return self.gradInput   

    def __repr__(self):
        """
        Pretty printing. Should be overrided in every module if you want 
        to have readable description. 
        """
        return "Criterion"

The **MSECriterion**, which is basic L2 norm usually used for regression, is implemented here for you.
- input:   **`batch_size x n_feats`**
- target: **`batch_size x n_feats`**
- output: **scalar**

In [None]:
class MSECriterion(Criterion):
    def __init__(self):
        super(MSECriterion, self).__init__()
        
    def updateOutput(self, input, target):   
        self.output = np.sum(np.power(input - target,2)) / input.shape[0]
        return self.output 
 
    def updateGradInput(self, input, target):
        self.gradInput  = (input - target) * 2 / input.shape[0]
        return self.gradInput

    def __repr__(self):
        return "MSECriterion"

## 9. Negative LogLikelihood criterion (numerically unstable)
You task is to implement the **ClassNLLCriterion**. It should implement [multiclass log loss](http://scikit-learn.org/stable/modules/model_evaluation.html#log-loss). Nevertheless there is a sum over `y` (target) in that formula, 
remember that targets are one-hot encoded. This fact simplifies the computations a lot. Note, that criterions are the only places, where you divide by batch size. Also there is a small hack with adding small number to probabilities to avoid computing log(0).
- input:   **`batch_size x n_feats`** - probabilities
- target: **`batch_size x n_feats`** - one-hot representation of ground truth
- output: **scalar**



In [None]:
class ClassNLLCriterionUnstable(Criterion):
    EPS = 1e-15
    def __init__(self):
        a = super(ClassNLLCriterionUnstable, self)
        super(ClassNLLCriterionUnstable, self).__init__()
        
    def updateOutput(self, input, target): 
        
        # Use this trick to avoid numerical errors
        input_clamp = np.clip(input, self.EPS, 1 - self.EPS)
        
        # Your code goes here. ################################################
        return self.output

    def updateGradInput(self, input, target):
        
        # Use this trick to avoid numerical errors
        input_clamp = np.clip(input, self.EPS, 1 - self.EPS)
                
        # Your code goes here. ################################################
        return self.gradInput
    
    def __repr__(self):
        return "ClassNLLCriterionUnstable"

In [None]:
def test_ClassNLLCriterionUnstable():
    np.random.seed(42)
    torch.manual_seed(42)

    batch_size, n_in = 2, 4
    for i in range(100):
        print(f"Iter {i}")
        # layers initialization
        torch_layer = torch.nn.NLLLoss()
        custom_layer = ClassNLLCriterionUnstable()

        layer_input = np.random.uniform(0, 1, (batch_size, n_in)).astype(np.float32)
        layer_input /= layer_input.sum(axis=-1, keepdims=True)
        layer_input = layer_input.clip(custom_layer.EPS, 1. - custom_layer.EPS)  # unifies input
        target_labels = np.random.choice(n_in, batch_size)
        target = np.zeros((batch_size, n_in), np.float32)
        target[np.arange(batch_size), target_labels] = 1  # one-hot encoding

        # 1. check layer output
        custom_layer_output = custom_layer.updateOutput(layer_input, target)
        layer_input_var = Variable(torch.from_numpy(layer_input), requires_grad=True)
        torch_layer_output_var = torch_layer(torch.log(layer_input_var), 
                                             Variable(torch.from_numpy(target_labels), requires_grad=False))
        assertAlmostEqual(torch_layer_output_var.data.numpy(), custom_layer_output, msg="1. check layer output")

        # 2. check layer input grad
        custom_layer_grad = custom_layer.updateGradInput(layer_input, target)
        torch_layer_output_var.backward()
        torch_layer_grad_var = layer_input_var.grad
        assertAlmostEqual(torch_layer_grad_var.data.numpy(), custom_layer_grad, msg='2. check layer input grad')
        
test_ClassNLLCriterionUnstable()

## 10. Negative LogLikelihood criterion (numerically stable)
- input:   **`batch_size x n_feats`** - log probabilities
- target: **`batch_size x n_feats`** - one-hot representation of ground truth
- output: **scalar**

Task is similar to the previous one, but now the criterion input is the output of log-softmax layer. This decomposition allows us to avoid problems with computation of forward and backward of log().

In [None]:
class ClassNLLCriterion(Criterion):
    def __init__(self):
        a = super(ClassNLLCriterion, self)
        super(ClassNLLCriterion, self).__init__()
        
    def updateOutput(self, input, target): 
        # Your code goes here. ################################################
        return self.output

    def updateGradInput(self, input, target):
        # Your code goes here. ################################################
        return self.gradInput
    
    def __repr__(self):
        return "ClassNLLCriterion"

In [None]:
def test_ClassNLLCriterion():
    np.random.seed(42)
    torch.manual_seed(42)

    batch_size, n_in = 2, 4
    for i in range(100):
        print(f"test_ClassNLLCriterion. Iter {i}")
        # layers initialization
        torch_layer = torch.nn.NLLLoss()
        custom_layer = ClassNLLCriterion()

        layer_input = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)
        layer_input = torch.nn.LogSoftmax(dim=1)(Variable(torch.from_numpy(layer_input))).data.numpy()
        target_labels = np.random.choice(n_in, batch_size)
        target = np.zeros((batch_size, n_in), np.float32)
        target[np.arange(batch_size), target_labels] = 1  # one-hot encoding

        # 1. check layer output
        custom_layer_output = custom_layer.updateOutput(layer_input, target)
        layer_input_var = Variable(torch.from_numpy(layer_input), requires_grad=True)
        torch_layer_output_var = torch_layer(layer_input_var, 
                                             Variable(torch.from_numpy(target_labels), requires_grad=False))
        assertAlmostEqual(torch_layer_output_var.data.numpy(), custom_layer_output, msg='1. check layer output')

        # 2. check layer input grad
        custom_layer_grad = custom_layer.updateGradInput(layer_input, target)
        torch_layer_output_var.backward()
        torch_layer_grad_var = layer_input_var.grad
        assertAlmostEqual(torch_layer_grad_var.data.numpy(), custom_layer_grad, msg='2. check layer input grad')
        
test_ClassNLLCriterion()

# Optimizers

### SGD optimizer with momentum
- `variables` - list of lists of variables (one list per layer)
- `gradients` - list of lists of current gradients (same structure as for `variables`, one array for each var)
- `config` - dict with optimization parameters (`learning_rate` and `momentum`)
- `state` - dict with optimizator state (used to save accumulated gradients)

In [None]:
def sgd_momentum(variables, gradients, config, state):  
    # 'variables' and 'gradients' have complex structure, accumulated_grads will be stored in a simpler one
    state.setdefault('accumulated_grads', {})
    
    var_index = 0 
    for current_layer_vars, current_layer_grads in zip(variables, gradients): 
        for current_var, current_grad in zip(current_layer_vars, current_layer_grads):
            
            old_grad = state['accumulated_grads'].setdefault(var_index, np.zeros_like(current_grad))
            
            np.add(config['momentum'] * old_grad, config['learning_rate'] * current_grad, out=old_grad)
            
            current_var -= old_grad
            var_index += 1     

## 11. [Adam](https://arxiv.org/pdf/1412.6980.pdf) optimizer
- `variables` - list of lists of variables (one list per layer)
- `gradients` - list of lists of current gradients (same structure as for `variables`, one array for each var)
- `config` - dict with optimization parameters (`learning_rate`, `beta1`, `beta2`, `epsilon`)
- `state` - dict with optimizator state (used to save 1st and 2nd moment for vars)

Formulas for optimizer:

Current step learning rate: $$\text{lr}_t = \text{learning_rate} * \frac{\sqrt{1-\beta_2^t}} {1-\beta_1^t}$$
First moment of var: $$\mu_t = \beta_1 * \mu_{t-1} + (1 - \beta_1)*g$$ 
Second moment of var: $$v_t = \beta_2 * v_{t-1} + (1 - \beta_2)*g*g$$
New values of var: $$\text{variable} = \text{variable} - \text{lr}_t * \frac{m_t}{\sqrt{v_t} + \epsilon}$$

In [None]:
def adam_optimizer(variables, gradients, config, state):  
    # 'variables' and 'gradients' have complex structure, accumulated_grads will be stored in a simpler one
    state.setdefault('m', {})  # first moment vars
    state.setdefault('v', {})  # second moment vars
    state.setdefault('t', 0)   # timestamp
    state['t'] += 1
    for k in ['learning_rate', 'beta1', 'beta2', 'epsilon']:
        assert k in config, config.keys()
    
    var_index = 0 
    lr_t = config['learning_rate'] * np.sqrt(1 - config['beta2']**state['t']) / (1 - config['beta1']**state['t'])
    for current_layer_vars, current_layer_grads in zip(variables, gradients): 
        for current_var, current_grad in zip(current_layer_vars, current_layer_grads):
            var_first_moment = state['m'].setdefault(var_index, np.zeros_like(current_grad))
            var_second_moment = state['v'].setdefault(var_index, np.zeros_like(current_grad))
            
            # <YOUR CODE> #######################################
            # update `current_var_first_moment`, `var_second_moment` and `current_var` values
            #np.add(... , out=var_first_moment)
            #np.add(... , out=var_second_moment)
            #current_var -= ...
            
            # small checks that you've updated the state; use np.add for rewriting np.arrays values
            assert var_first_moment is state['m'].get(var_index)
            assert var_second_moment is state['v'].get(var_index)
            var_index += 1


In [None]:
def test_adam_optimizer():
    state = {}  
    config = {'learning_rate': 1e-3, 'beta1': 0.9, 'beta2':0.999, 'epsilon':1e-8}
    variables = [[np.arange(10).astype(np.float64)]]
    gradients = [[np.arange(10).astype(np.float64)]]
    adam_optimizer(variables, gradients, config, state)
    
    assertAlmostEqual(state['m'][0], np.array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]))
    assertAlmostEqual(state['v'][0], np.array([0., 0.001, 0.004, 0.009, 0.016, 0.025, 0.036, 0.049, 0.064, 0.081]))
    assertTrue(state['t'] == 1)
    assertAlmostEqual(variables[0][0], np.array([0., 0.999, 1.999, 2.999, 3.999, 4.999, 5.999, 6.999, 7.999, 8.999]))
    adam_optimizer(variables, gradients, config, state)
    assertAlmostEqual(state['m'][0], np.array([0., 0.19, 0.38, 0.57, 0.76, 0.95, 1.14, 1.33, 1.52, 1.71]))
    assertAlmostEqual(state['v'][0], np.array([0., 0.001999, 0.007996, 0.017991, 0.031984, 0.049975, 0.071964, 0.097951, 0.127936, 0.161919]))
    assertTrue(state['t'] == 2)
    assertAlmostEqual(variables[0][0], np.array([0., 0.998, 1.998, 2.998, 3.998, 4.998, 5.998, 6.998, 7.998, 8.998]))
    
# test_adam_optimizer()

# Layers for advanced track homework
You **don't need** to implement it if you are working on `homework_main-basic.ipynb`

## 12. Conv2d [Advanced]
- input:   **`batch_size x in_channels x h x w`**
- output: **`batch_size x out_channels x h x w`**

You should implement something like pytorch `Conv2d` layer with `stride=1` and zero-padding outside of image using `scipy.signal.correlate` function.

Practical notes:
- While the layer name is "convolution", the most of neural network frameworks (including tensorflow and pytorch) implement operation that is called [correlation](https://en.wikipedia.org/wiki/Cross-correlation#Cross-correlation_of_deterministic_signals) in signal processing theory. So **don't use** `scipy.signal.convolve` since it implements [convolution](https://en.wikipedia.org/wiki/Convolution#Discrete_convolution) in terms of signal processing.
- It may be convenient to use `skimage.util.pad` for zero-padding.
- It's rather ok to implement convolution over 4d array using 2 nested loops: one over batch size dimension and another one over output filters dimension
- Having troubles with understanding how to implement the layer? 
 - Check the last year video of lecture 3 (starting from ~1:14:20)
 - May the google be with you

In [None]:
import scipy as sp
import scipy.signal
import skimage

class Conv2d(Module):
    def __init__(self, in_channels, out_channels, kernel_size):
        super(Conv2d, self).__init__()
        assert kernel_size % 2 == 1, kernel_size
       
        stdv = 1./np.sqrt(in_channels)
        self.W = np.random.uniform(-stdv, stdv, size = (out_channels, in_channels, kernel_size, kernel_size))
        self.b = np.random.uniform(-stdv, stdv, size=(out_channels,))
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        
        self.gradW = np.zeros_like(self.W)
        self.gradb = np.zeros_like(self.b)
        
    def updateOutput(self, input):
        pad_size = self.kernel_size // 2
        # YOUR CODE ##############################
        # 1. zero-pad the input array
        # 2. compute convolution using scipy.signal.correlate(... , mode='valid')
        # 3. add bias value
        
        # self.output = ...
        
        return self.output
    
    def updateGradInput(self, input, gradOutput):
        pad_size = self.kernel_size // 2
        # YOUR CODE ##############################
        # 1. zero-pad the gradOutput
        # 2. compute 'self.gradInput' value using scipy.signal.correlate(... , mode='valid')
        
        # self.gradInput = ...
        
        return self.gradInput
    
    def accGradParameters(self, input, gradOutput):
        pad_size = self.kernel_size // 2
        # YOUR CODE #############
        # 1. zero-pad the input
        # 2. compute 'self.gradW' using scipy.signal.correlate(... , mode='valid')
        # 3. compute 'self.gradb' - formulas like in Linear of ChannelwiseScaling layers
        
        # self.gradW = ...
        # self.gradb = ...
        pass
    
    def zeroGradParameters(self):
        self.gradW.fill(0)
        self.gradb.fill(0)
        
    def getParameters(self):
        return [self.W, self.b]
    
    def getGradParameters(self):
        return [self.gradW, self.gradb]
    
    def __repr__(self):
        s = self.W.shape
        q = 'Conv2d %d -> %d' %(s[1],s[0])
        return q

In [None]:
def test_Conv2d(self):
    np.random.seed(42)
    torch.manual_seed(42)

    batch_size, n_in, n_out = 2, 3, 4
    h,w = 5,6
    kern_size = 3
    for _ in range(100):
        # layers initialization
        torch_layer = torch.nn.Conv2d(n_in, n_out, kern_size, padding=1)
        custom_layer = Conv2d(n_in, n_out, kern_size)
        custom_layer.W = torch_layer.weight.data.numpy() # [n_out, n_in, kern, kern]
        custom_layer.b = torch_layer.bias.data.numpy()

        layer_input = np.random.uniform(-1, 1, (batch_size, n_in, h,w)).astype(np.float32)
        next_layer_grad = np.random.uniform(-1, 1, (batch_size, n_out, h, w)).astype(np.float32)

        # 1. check layer output
        custom_layer_output = custom_layer.updateOutput(layer_input)
        layer_input_var = Variable(torch.from_numpy(layer_input), requires_grad=True)
        torch_layer_output_var = torch_layer(layer_input_var)
        assertAlmostEqual(torch_layer_output_var.data.numpy(), custom_layer_output, atol=1e-6)

        # 2. check layer input grad
        custom_layer_grad = custom_layer.updateGradInput(layer_input, next_layer_grad)
        torch_layer_output_var.backward(torch.from_numpy(next_layer_grad))
        torch_layer_grad_var = layer_input_var.grad
        assertAlmostEqual(torch_layer_grad_var.data.numpy(), custom_layer_grad, atol=1e-6))

        # 3. check layer parameters grad
        custom_layer.accGradParameters(layer_input, next_layer_grad)
        weight_grad = custom_layer.gradW
        bias_grad = custom_layer.gradb
        torch_weight_grad = torch_layer.weight.grad.data.numpy()
        torch_bias_grad = torch_layer.bias.grad.data.numpy()
        #m = ~np.isclose(torch_weight_grad, weight_grad, atol=1e-5)
        assertAlmostEqual(torch_weight_grad, weight_grad, atol=1e-6, )
        assertAlmostEqual(torch_bias_grad, bias_grad, atol=1e-6)
        
# test_Conv2d()

## 13. MaxPool2d [Advanced]
- input:   **`batch_size x n_input_channels x h x w`**
- output: **`batch_size x n_output_channels x h // kern_size x w // kern_size`**

You are to implement simplified version of pytorch `MaxPool2d` layer with stride = kernel_size. Please note, that it's not a common case that stride = kernel_size: in AlexNet and ResNet kernel_size for max-pooling was set to 3, while stride was set to 2. We introduce this restriction to make implementation simplier.

Practical notes:
- During forward pass what you need to do is just to reshape the input tensor to `[n, c, h / kern_size, kern_size, w / kern_size, kern_size]`, swap two axes and take maximums over the last two dimensions. Reshape + axes swap is sometimes called space-to-batch transform.
- During backward pass you need to place the gradients in positions of maximal values taken during the forward pass
- In real frameworks the indices of maximums are stored in memory during the forward pass. It is cheaper than to keep the layer input in memory and recompute the maximums.

In [None]:
class MaxPool2d(Module):
    def __init__(self, kernel_size):
        super(MaxPool2d, self).__init__()
        self.kernel_size = kernel_size
        self.gradInput = None
                    
    def updateOutput(self, input):
        input_h, input_w = input.shape[-2:]
        # your may remove these asserts and implement MaxPool2d with padding
        assert input_h % self.kernel_size == 0  
        assert input_w % self.kernel_size == 0
        
        # YOUR CODE #############################
        # self.output = ...
        # self.max_indices = ...
        return self.output
    
    def updateGradInput(self, input, gradOutput):
        # YOUR CODE #############################
        # self.gradInput = ...
        return self.gradInput
    
    def __repr__(self):
        q = 'MaxPool2d, kern %d, stride %d' %(self.kernel_size, self.kernel_size)
        return q

In [None]:
def test_MaxPool2d():
    np.random.seed(42)
    torch.manual_seed(42)

    batch_size, n_in = 2, 3
    h,w = 4,6
    kern_size = 2
    for _ in range(100):
        # layers initialization
        torch_layer = torch.nn.MaxPool2d(kern_size)
        custom_layer = MaxPool2d(kern_size)

        layer_input = np.random.uniform(-10, 10, (batch_size, n_in, h,w)).astype(np.float32)
        next_layer_grad = np.random.uniform(-10, 10, (batch_size, n_in, 
                                                      h // kern_size, w // kern_size)).astype(np.float32)

        # 1. check layer output
        custom_layer_output = custom_layer.updateOutput(layer_input)
        layer_input_var = Variable(torch.from_numpy(layer_input), requires_grad=True)
        torch_layer_output_var = torch_layer(layer_input_var)
        assertAlmostEqual(torch_layer_output_var.data.numpy(), custom_layer_output, atol=1e-6)

        # 2. check layer input grad
        custom_layer_grad = custom_layer.updateGradInput(layer_input, next_layer_grad)
        torch_layer_output_var.backward(torch.from_numpy(next_layer_grad))
        torch_layer_grad_var = layer_input_var.grad
        assertAlmostEqual(torch_layer_grad_var.data.numpy(), custom_layer_grad, atol=1e-6)
        
test_MaxPool2d()

### Flatten layer
Just reshapes inputs and gradients. It's usually used as proxy layer between Conv2d and Linear.

In [None]:
class Flatten(Module):
    def __init__(self):
         super(Flatten, self).__init__()
    
    def updateOutput(self, input):
        self.output = input.reshape(len(input), -1)
        return self.output
    
    def updateGradInput(self, input, gradOutput):
        self.gradInput = gradOutput.reshape(input.shape)
        return self.gradInput
    
    def __repr__(self):
        return "Flatten"