In [7]:
import numpy as np

**Module** is an abstract class which defines the fundamental methods necessary for training a neural network. You do not need to change anything here, just read the comments.

In [10]:
class Module(object):
    """
    Abstract building block of a network.

    A module can be seen as a black box that turns `input` data into
    `output` data by applying a function called `forward`:

        output = module.forward(input)

    A module must also support a backward pass, i.e. differentiate its
    `forward` function. Because it may be one link of a longer chain, it
    receives the gradient coming from the next step of the chain rule:

        gradInput = module.backward(input, gradOutput)
    """
    def __init__(self):
        self.training = True
        self.gradInput = None
        self.output = None

    def forward(self, input):
        """
        Computes the output of the module for the given `input`
        by delegating to `updateOutput`.
        """
        result = self.updateOutput(input)
        return result

    def backward(self, input, gradOutput):
        """
        Runs one backpropagation step through the module for the given `input`.

        Two things happen, in this order:
         - the gradient w.r.t. `input` is computed (needed for further backprop),
         - the gradient w.r.t. parameters is computed (needed by the optimizer).
        """
        for step in (self.updateGradInput, self.accGradParameters):
            step(input, gradOutput)
        return self.gradInput

    def updateOutput(self, input):
        """
        Computes the output from the current parameters and `input`.

        Implementations must both store the result in the `output` field
        and return it. The simplest possible implementation is:

            self.output = input
            return self.output

        The base class does nothing and returns None.
        """
        return None

    def updateGradInput(self, input, gradOutput):
        """
        Computes the gradient of the module with respect to its own input.

        The shape of `gradInput` is always the same as the shape of `input`.
        Implementations must both store the gradient in the `gradInput` field
        and return it. The simplest possible implementation is:

            self.gradInput = gradOutput
            return self.gradInput

        The base class does nothing and returns None.
        """
        return None

    def accGradParameters(self, input, gradOutput):
        """
        Computes the gradient of the module with respect to its own parameters.
        Modules without parameters (e.g. ReLU) do not need to override this.
        """
        return None

    def zeroGradParameters(self):
        """
        Resets the parameter gradients to zero, if the module has parameters.
        """
        return None

    def getParameters(self):
        """
        Returns the list of parameters; an empty list if there are none.
        """
        return list()

    def getGradParameters(self):
        """
        Returns the list of gradients w.r.t. the parameters;
        an empty list if there are none.
        """
        return list()

    def train(self):
        """
        Switches the module to training mode.
        Dropout and BatchNorm behave differently in training and testing.
        """
        self.training = True

    def evaluate(self):
        """
        Switches the module to evaluation mode.
        Dropout and BatchNorm behave differently in training and testing.
        """
        self.training = False

    def __repr__(self):
        """
        Pretty printing. Override in every module that should have
        a readable description.
        """
        return "Module"

# Sequential container

**Define** the forward and backward pass procedures.

In [11]:
class Sequential(Module):
    """
         This class implements a container, which processes `input` data sequentially.

         `input` is processed by each module (layer) in self.modules consecutively.
         The resulting array is called `output`.
    """

    def __init__ (self):
        super(Sequential, self).__init__()
        self.modules = []

    def add(self, module):
        """
        Adds a module to the end of the container.
        """
        self.modules.append(module)

    def updateOutput(self, input):
        """
        Basic workflow of FORWARD PASS:

            y_0    = module[0].forward(input)
            y_1    = module[1].forward(y_0)
            ...
            output = module[n-1].forward(y_{n-2})

        With no modules added, the container is the identity.
        """
        self.output = input
        for module in self.modules:
            self.output = module.forward(self.output)
        return self.output

    def backward(self, input, gradOutput):
        """
        Workflow of BACKWARD PASS:

            g_{n-1} = module[n-1].backward(y_{n-2}, gradOutput)
            g_{n-2} = module[n-2].backward(y_{n-3}, g_{n-1})
            ...
            g_1 = module[1].backward(y_0, g_2)
            gradInput = module[0].backward(input, g_1)

        Each module must receive the same input it saw during the forward
        pass: for the `i-th` module that is the output of `module[i-1]`
        (and `input` of this container only for `module[0]`).

        The layer outputs are read from `module.output`, so `forward` must
        have been called on the same `input` beforehand.
        """
        self.gradInput = gradOutput
        for i in reversed(range(len(self.modules))):
            layer_input = self.modules[i - 1].output if i > 0 else input
            # `Module.backward` already runs both updateGradInput and
            # accGradParameters with the correct upstream gradient; calling
            # accGradParameters again here would accumulate parameter
            # gradients a second time, and with the wrong gradient.
            self.gradInput = self.modules[i].backward(layer_input, self.gradInput)
        return self.gradInput


    def zeroGradParameters(self):
        """
        Zeroes the parameter gradients of every contained module.
        """
        for module in self.modules:
            module.zeroGradParameters()

    def getParameters(self):
        """
        Gathers the parameters of all modules: one list per module.
        """
        return [x.getParameters() for x in self.modules]

    def getGradParameters(self):
        """
        Gathers the gradients w.r.t. parameters of all modules: one list per module.
        """
        return [x.getGradParameters() for x in self.modules]

    def __repr__(self):
        string = "".join([str(x) + '\n' for x in self.modules])
        return string

    def __getitem__(self,x):
        return self.modules.__getitem__(x)

    def train(self):
        """
        Propagates training mode through all modules
        """
        self.training = True
        for module in self.modules:
            module.train()

    def evaluate(self):
        """
        Propagates evaluation mode through all modules
        """
        self.training = False
        for module in self.modules:
            module.evaluate()

# Layers

## 1 (0.2). Linear transform layer
Also known as dense layer, fully-connected layer, FC-layer, InnerProductLayer (in caffe), affine transform
- input:   **`batch_size x n_feats1`**
- output: **`batch_size x n_feats2`**

In [12]:
class Linear(Module):
    """
    A module which applies a linear transformation.
    A common name is fully-connected layer, InnerProductLayer in caffe.

    The module should work with 2D input of shape (n_samples, n_feature).
    """
    def __init__(self, n_in, n_out):
        super(Linear, self).__init__()

        # Uniform initialization scaled by the fan-in.
        # W is drawn before b; keep this order so seeded runs stay reproducible.
        bound = 1. / np.sqrt(n_in)
        self.W = np.random.uniform(-bound, bound, size=(n_out, n_in))
        self.b = np.random.uniform(-bound, bound, size=n_out)

        self.gradW = np.zeros_like(self.W)
        self.gradb = np.zeros_like(self.b)

    def updateOutput(self, input):
        """Computes `input @ W.T + b`, stores it in `output` and returns it."""
        projected = np.dot(input, self.W.T)
        self.output = projected + self.b
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Computes the gradient w.r.t. the input: `gradOutput @ W`."""
        self.gradInput = np.dot(gradOutput, self.W)
        return self.gradInput

    def accGradParameters(self, input, gradOutput):
        """Adds this batch's contribution to `gradW` and `gradb`."""
        weight_step = np.dot(gradOutput.T, input)
        bias_step = np.sum(gradOutput, axis=0)
        self.gradW = self.gradW + weight_step
        self.gradb = self.gradb + bias_step

    def zeroGradParameters(self):
        """Resets the accumulated gradients in place."""
        for grad in (self.gradW, self.gradb):
            grad.fill(0)

    def getParameters(self):
        """Returns `[W, b]`."""
        return [self.W, self.b]

    def getGradParameters(self):
        """Returns `[gradW, gradb]`."""
        return [self.gradW, self.gradb]

    def __repr__(self):
        n_out, n_in = self.W.shape
        return 'Linear %d -> %d' % (n_in, n_out)

## 2. (0.2) SoftMax
- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

$\text{softmax}(x)_i = \frac{\exp x_i} {\sum_j \exp x_j}$

Recall that $\text{softmax}(x) = \text{softmax}(x - \text{const})$. This makes it possible to avoid computing exp() of a large argument.

In [13]:
class SoftMax(Module):
    """
    Row-wise softmax for 2D input of shape (batch_size, n_feats).
    """
    def __init__(self):
         super(SoftMax, self).__init__()

    def updateOutput(self, input):
        """
        Computes softmax along axis 1.

        The row maximum is subtracted first: softmax is invariant to a
        per-row shift, and this keeps exp() away from large arguments.
        """
        shifted = input - np.max(input, axis=1, keepdims=True)
        exps = np.exp(shifted)
        self.output = exps / np.sum(exps, axis=1, keepdims=True)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Computes the gradient w.r.t. the input from the stored `output`.

        For one row with softmax s and upstream gradient g the Jacobian is
        diag(s) - s s^T, so J g = s * (g - <s, g>). Using the closed form
        avoids building an n_feats x n_feats matrix per sample.
        """
        weighted_sum = np.sum(gradOutput * self.output, axis=1, keepdims=True)
        self.gradInput = self.output * (gradOutput - weighted_sum)
        return self.gradInput

    def __repr__(self):
        return "SoftMax"

## 3. (0.2) LogSoftMax
- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

$\text{logsoftmax}(x)_i = \log\text{softmax}(x)_i = x_i - \log {\sum_j \exp x_j}$

The main goal of this layer is to be used in computation of log-likelihood loss.

In [14]:
class LogSoftMax(Module):
    """
    Row-wise log-softmax for 2D input of shape (batch_size, n_feats).
    """
    def __init__(self):
         super(LogSoftMax, self).__init__()

    def updateOutput(self, input):
        """
        Computes `x - max(x) - log(sum(exp(x - max(x))))` along axis 1.
        Subtracting the row maximum keeps exp() numerically stable.
        """
        shifted = input - input.max(axis=1, keepdims=True)
        log_norm = np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
        self.output = shifted - log_norm
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Computes `g - softmax(x) * sum(g)` per row, where softmax(x)
        is recovered as exp of the stored `output`.
        """
        grad_total = np.sum(gradOutput, axis=1, keepdims=True)
        probabilities = np.exp(self.output)
        self.gradInput = gradOutput - probabilities * grad_total
        return self.gradInput

    def __repr__(self):
        return "LogSoftMax"

## 4. (0.3) Batch normalization
One of the most significant recent ideas that impacted NNs a lot is [**Batch normalization**](http://arxiv.org/abs/1502.03167). The idea is simple, yet effective: the features should be whitened ($mean = 0$, $std = 1$) all the way through NN. This improves the convergence for deep models letting it train them for days but not weeks. **You are** to implement the first part of the layer: features normalization. The second part (`ChannelwiseScaling` layer) is implemented below.

- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

The layer should work as follows. While training (`self.training == True`) it transforms input as $$y = \frac{x - \mu}  {\sqrt{\sigma^2 + \epsilon}}$$
where $\mu$ and $\sigma^2$ are the mean and variance of feature values in the **batch** and $\epsilon$ is just a small number for numerical stability. Also during training, the layer should maintain exponential moving average values for mean and variance:
```
    self.moving_mean = self.moving_mean * alpha + batch_mean * (1 - alpha)
    self.moving_variance = self.moving_variance * alpha + batch_variance * (1 - alpha)
```
During testing (`self.training == False`) the layer normalizes input using moving_mean and moving_variance.

Note that decomposition of batch normalization on normalization itself and channelwise scaling here is just a common **implementation** choice. In general "batch normalization" always assumes normalization + scaling.

In [15]:
class BatchNormalization(Module):
    """
    Normalization part of batch normalization for 2D input (batch_size, n_feats).

    In training mode features are normalized with the statistics of the
    current batch and exponential moving averages of those statistics are
    maintained. In evaluation mode the moving averages are used instead.
    `alpha` is the weight of the previous moving value in the update.
    """
    EPS = 1e-3
    def __init__(self, alpha = 0.):
        super(BatchNormalization, self).__init__()
        self.alpha = alpha
        self.moving_mean = None
        self.moving_variance = None

    def updateOutput(self, input):
        """
        Normalizes `input` per feature (axis 0 is the batch axis).
        """
        # Moving statistics are created lazily because the number of
        # features is only known once the first batch arrives.
        if self.moving_mean is None:
            self.moving_mean = np.zeros(input.shape[1])
            self.moving_variance = np.ones(input.shape[1])

        if self.training:
            batch_mean = np.mean(input, axis=0)
            batch_variance = np.var(input, axis=0)

            self.moving_mean = self.alpha * self.moving_mean + (1 - self.alpha) * batch_mean
            self.moving_variance = self.alpha * self.moving_variance + (1 - self.alpha) * batch_variance

            self.output = (input - batch_mean) / np.sqrt(batch_variance + self.EPS)
        else:
            self.output = (input - self.moving_mean) / np.sqrt(self.moving_variance + self.EPS)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Computes the gradient w.r.t. `input`, matching the mode used in forward.

        Evaluation mode: the statistics are constants, so the layer is a
        per-feature affine map and the gradient is just a rescaling.

        Training mode: mean and variance depend on every sample of the batch.
        With x_hat = (x - mean) / std the full gradient is
            (g - mean(g) - x_hat * mean(g * x_hat)) / std
        where the means are taken over the batch axis.
        """
        if not self.training:
            if self.moving_variance is None:
                # Same default as the lazy initialization in updateOutput.
                variance = np.ones(input.shape[1])
            else:
                variance = self.moving_variance
            self.gradInput = gradOutput / np.sqrt(variance + self.EPS)
            return self.gradInput

        mean = np.mean(input, axis=0)
        variance = np.var(input, axis=0)
        std_inv = 1.0 / np.sqrt(variance + self.EPS)
        normalized = (input - mean) * std_inv

        grad_mean = np.mean(gradOutput, axis=0)
        grad_proj = np.mean(gradOutput * normalized, axis=0)
        self.gradInput = std_inv * (gradOutput - grad_mean - normalized * grad_proj)
        return self.gradInput

    def __repr__(self):
        return "BatchNormalization"

In [16]:
class ChannelwiseScaling(Module):
    r"""
       Implements linear transform of input y = \gamma * x + \beta
       where \gamma, \beta - learnable vectors of length x.shape[-1]
    """
    # The docstring above is a raw string: in a normal string `\b` is a
    # backspace character and `\g` is an invalid escape sequence.
    def __init__(self, n_out):
        super(ChannelwiseScaling, self).__init__()

        # gamma is drawn before beta; keep this order for seeded runs.
        stdv = 1./np.sqrt(n_out)
        self.gamma = np.random.uniform(-stdv, stdv, size=n_out)
        self.beta = np.random.uniform(-stdv, stdv, size=n_out)

        self.gradGamma = np.zeros_like(self.gamma)
        self.gradBeta = np.zeros_like(self.beta)

    def updateOutput(self, input):
        """Computes `input * gamma + beta` (broadcast over the last axis)."""
        self.output = input * self.gamma + self.beta
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Computes the gradient w.r.t. the input: `gradOutput * gamma`."""
        self.gradInput = gradOutput * self.gamma
        return self.gradInput

    def accGradParameters(self, input, gradOutput):
        """
        Computes the parameter gradients for the current batch.

        NOTE: the gradients are overwritten, not accumulated, on every call.
        """
        self.gradBeta = np.sum(gradOutput, axis=0)
        self.gradGamma = np.sum(gradOutput*input, axis=0)

    def zeroGradParameters(self):
        """Resets both parameter gradients to zero in place."""
        self.gradGamma.fill(0)
        self.gradBeta.fill(0)

    def getParameters(self):
        """Returns `[gamma, beta]`."""
        return [self.gamma, self.beta]

    def getGradParameters(self):
        """Returns `[gradGamma, gradBeta]`."""
        return [self.gradGamma, self.gradBeta]

    def __repr__(self):
        return "ChannelwiseScaling"

Practical notes. If BatchNormalization is placed after a linear transformation layer (including dense layer, convolutions, channelwise scaling) that implements a function like `y = weight * x + bias`, then adding the bias becomes useless and can be omitted, since its effect is discarded by the batch mean subtraction. If BatchNormalization (followed by `ChannelwiseScaling`) is placed before a layer that propagates scale (including ReLU, LeakyReLU) followed by any linear transformation layer, then the parameter `gamma` in `ChannelwiseScaling` can be frozen, since it can be absorbed into the linear transformation layer.

## 5. (0.3) Dropout
Implement [**dropout**](https://www.cs.toronto.edu/~hinton/absps/JMLRdropout.pdf). The idea and implementation are really simple: just multiply the input by a $Bernoulli(1 - p)$ mask. Here $p$ is the probability of an element being zeroed.

This has proven to be an effective technique for regularization and preventing the co-adaptation of neurons.

While training (`self.training == True`) it should sample a mask on each iteration (for every batch), zero out elements and multiply elements by $1 / (1 - p)$. The latter is needed for keeping mean values of features close to mean values which will be in test mode. When testing this module should implement identity transform i.e. `self.output = input`.

- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

In [18]:
class Dropout(Module):
    """
    Inverted dropout: `p` is the probability of zeroing an element.
    """
    def __init__(self, p=0.5):
        super(Dropout, self).__init__()
        self.p = p
        self.mask = None

    def updateOutput(self, input):
        """
        Training: samples a fresh mask, zeroes the dropped elements and
        rescales the kept ones by 1 / (1 - p). Evaluation: identity.
        """
        if not self.training:
            self.output = input
            return self.output

        kept = np.random.rand(*input.shape) > self.p
        # The scale factor is folded into the mask so backward can reuse it.
        self.mask = kept / (1.0 - self.p)
        self.output = input * self.mask
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Training: applies the mask sampled in the last forward pass.
        Evaluation: passes the gradient through unchanged.
        """
        if self.training:
            self.gradInput = gradOutput * self.mask
        else:
            self.gradInput = gradOutput
        return self.gradInput

    def __repr__(self):
        return "Dropout"

## 6. (2.0) Conv2d
Implement [**Conv2d**](https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html). Use only this list of parameters: (in_channels, out_channels, kernel_size, stride, padding, bias, padding_mode) and fix dilation=1 and groups=1.

In [19]:
class Conv2d(Module):
    """
    2D convolution (cross-correlation) with dilation=1 and groups=1.

    - input:  batch_size x in_channels x H x W
    - output: batch_size x out_channels x outH x outW

    Parameters
        in_channels, out_channels: number of input / output channels.
        kernel_size: int, side of the square kernel.
        stride: int or (stride_y, stride_x) tuple.
        padding: int applied to both spatial axes, or 'same'
            (resolved as (kernel_size - stride) // 2).
        bias: whether a learnable per-channel bias is added.
        padding_mode: 'zeros', 'replicate' or 'reflect'; the NumPy names
            'constant' and 'edge' are accepted as well.
    """

    # PyTorch-style padding_mode names mapped to np.pad modes. The NumPy
    # names map to themselves so objects configured with them keep working.
    _NUMPY_PAD_MODES = {
        'zeros': 'constant',
        'constant': 'constant',
        'replicate': 'edge',
        'edge': 'edge',
        'reflect': 'reflect',
    }

    def __init__(self, in_channels, out_channels, kernel_size,
                 stride=1, padding=0, bias=True, padding_mode='zeros'):
        super(Conv2d, self).__init__()

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.padding_mode = padding_mode

        if bias:
            self.bias = np.zeros(self.out_channels)
        else:
            self.bias = None

        # He initialization scaled by the fan-in.
        self.weight = np.random.randn(
            self.out_channels, self.in_channels, self.kernel_size, self.kernel_size
        ) * np.sqrt(2. / (self.in_channels * self.kernel_size * self.kernel_size))

        self.gradWeight = np.zeros_like(self.weight)
        self.gradBias = None if self.bias is None else np.zeros_like(self.bias)

    def _padding_size(self):
        """Returns the integer padding without modifying `self.padding`."""
        if self.padding == 'same':
            return (self.kernel_size - self.stride) // 2
        return self.padding

    def _strides(self):
        """Returns the stride as a (stride_y, stride_x) pair."""
        if isinstance(self.stride, tuple):
            return self.stride
        return (self.stride, self.stride)

    def _numpy_pad_mode(self):
        """Translates `padding_mode` to the np.pad mode name (unknown names pass through)."""
        return self._NUMPY_PAD_MODES.get(self.padding_mode, self.padding_mode)

    def fold_padding_into_edges(self, grad_input_padded, pad):
        """
        Adjoint of replicate ('edge') padding: every padding row/column is a
        copy of the nearest border row/column, so its gradient is added there.
        Modifies `grad_input_padded` in place and returns it.
        """
        if pad <= 0:
            return grad_input_padded
        # top padding back into first valid row
        grad_input_padded[:, :, pad, :] += np.sum(grad_input_padded[:, :, :pad, :], axis=2)
        # bottom padding
        grad_input_padded[:, :, -pad-1, :] += np.sum(grad_input_padded[:, :, -pad:, :], axis=2)
        # left padding
        grad_input_padded[:, :, :, pad] += np.sum(grad_input_padded[:, :, :, :pad], axis=3)
        # right padding
        grad_input_padded[:, :, :, -pad-1] += np.sum(grad_input_padded[:, :, :, -pad:], axis=3)
        return grad_input_padded

    @staticmethod
    def fold_padding_into_edges_offset(grad_input_padded, pad):
        """
        Adjoint of reflect padding. Modifies `grad_input_padded` in place
        and returns it.

        With np.pad(mode='reflect') the padded row `i` (0 = outermost) mirrors
        the padded row `2 * pad - i`, and the padded row `h - pad + i` mirrors
        `h - pad - 2 - i`; columns behave the same way. Rows are folded over
        the full width first so corner gradients reach the interior when the
        columns are folded afterwards.
        """
        h, w = grad_input_padded.shape[2:]

        for i in range(pad):
            # Top padding row i -> its mirror row inside the image.
            grad_input_padded[:, :, 2 * pad - i, :] += grad_input_padded[:, :, i, :]
            # Bottom padding row i -> its mirror row inside the image.
            grad_input_padded[:, :, h - pad - 2 - i, :] += grad_input_padded[:, :, h - pad + i, :]

        for i in range(pad):
            # Left padding column i -> its mirror column inside the image.
            grad_input_padded[:, :, :, 2 * pad - i] += grad_input_padded[:, :, :, i]
            # Right padding column i -> its mirror column inside the image.
            grad_input_padded[:, :, :, w - pad - 2 - i] += grad_input_padded[:, :, :, w - pad + i]

        return grad_input_padded

    def fold_padding_into_reflect(self, grad_input_padded, pad):
        """Instance-level entry point for the reflect-padding adjoint."""
        return self.fold_padding_into_edges_offset(grad_input_padded, pad)

    def pad_input(self, input):
        """
        Pads the two spatial axes of `input` according to `padding` and
        `padding_mode`. Returns `input` itself when the padding is zero.
        Does not modify the layer configuration.
        """
        pad = self._padding_size()
        if pad == 0:
            return input

        mode = self._numpy_pad_mode()
        kwargs = {'constant_values': 0} if mode == 'constant' else {}
        return np.pad(input, ((0, 0), (0, 0), (pad, pad), (pad, pad)), mode=mode, **kwargs)

    def updateOutput(self, input):
        """
        Computes the convolution of `input` with `weight` (plus `bias`).
        """
        self.input = input
        B, _, H, W = input.shape

        padded_input = self.pad_input(input)
        self.padded_input = padded_input
        stride_y, stride_x = self._strides()
        pad = self._padding_size()
        k = self.kernel_size
        outH = (H + 2 * pad - k) // stride_y + 1
        outW = (W + 2 * pad - k) // stride_x + 1
        self.output = np.zeros((B, self.out_channels, outH, outW))

        for i in range(outH):
            top = i * stride_y
            for j in range(outW):
                left = j * stride_x
                region = padded_input[:, :, top:top + k, left:left + k]
                # Contract (C, k, k) of the window with (C, k, k) of every
                # filter: (B, C, k, k) x (O, C, k, k) -> (B, O).
                self.output[:, :, i, j] = np.tensordot(
                    region, self.weight, axes=([1, 2, 3], [1, 2, 3]))

        if self.bias is not None:
            self.output += self.bias[None, :, None, None]
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Computes the gradient w.r.t. `input`.

        Gradients are first scattered into a buffer of the padded shape and
        then mapped back onto the unpadded input according to the padding
        mode (dropped for zeros, folded for replicate / reflect).
        """
        B, in_C, H, W = input.shape
        k = self.kernel_size
        stride_y, stride_x = self._strides()
        p = self._padding_size()
        outH, outW = gradOutput.shape[2:]

        grad_input_padded = np.zeros((B, in_C, H + 2*p, W + 2*p))

        for i in range(outH):
            top = i * stride_y
            for j in range(outW):
                left = j * stride_x
                # (B, O) x (O, C, k, k) -> (B, C, k, k)
                grad_input_padded[:, :, top:top + k, left:left + k] += np.tensordot(
                    gradOutput[:, :, i, j], self.weight, axes=([1], [0]))

        if p == 0:
            self.gradInput = grad_input_padded
        else:
            mode = self._numpy_pad_mode()
            if mode == 'edge':
                grad_input_padded = self.fold_padding_into_edges(grad_input_padded, p)
            elif mode == 'reflect':
                grad_input_padded = self.fold_padding_into_reflect(grad_input_padded, p)
            self.gradInput = grad_input_padded[:, :, p:-p, p:-p]
        return self.gradInput

    def accGradParameters(self, input, gradOutput):
        """
        Accumulates the gradients w.r.t. `weight` and `bias` for this batch.
        """
        padded_input = self.pad_input(input)
        k = self.kernel_size
        stride_y, stride_x = self._strides()
        outH, outW = gradOutput.shape[2:]

        for i in range(outH):
            top = i * stride_y
            for j in range(outW):
                left = j * stride_x
                region = padded_input[:, :, top:top + k, left:left + k]
                # Sum over the batch: (B, O) x (B, C, k, k) -> (O, C, k, k)
                self.gradWeight += np.tensordot(
                    gradOutput[:, :, i, j], region, axes=([0], [0]))

        if self.bias is not None:
            self.gradBias += np.sum(gradOutput, axis=(0, 2, 3))

    def zeroGradParameters(self):
        """Resets the accumulated parameter gradients in place."""
        self.gradWeight.fill(0)
        if self.gradBias is not None:
            self.gradBias.fill(0)

    def getParameters(self):
        """Returns `[weight, bias]`, or `[weight]` when the layer has no bias."""
        if self.bias is None:
            return [self.weight]
        return [self.weight, self.bias]

    def getGradParameters(self):
        """Returns the gradients in the same order as `getParameters`."""
        if self.bias is None:
            return [self.gradWeight]
        return [self.gradWeight, self.gradBias]

    def __repr__(self):
        return "Conv2d"

#7. (0.5) Implement [**MaxPool2d**](https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html) and [**AvgPool2d**](https://pytorch.org/docs/stable/generated/torch.nn.AvgPool2d.html). Use only parameters like kernel_size, stride, padding (negative infinity for maxpool and zero for avgpool) and other parameters fixed as in framework.

In [20]:
class MaxPool2d(Module):
    """
    2D max pooling over input of shape (batch_size, channels, H, W).

    `kernel_size`, `stride` and `padding` are ints applied to both spatial
    axes. Padding is filled with negative infinity so that it can never be
    selected as a maximum.
    """
    def __init__(self, kernel_size, stride, padding):
        super(MaxPool2d, self).__init__()

        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding

    def updateOutput(self, input):
        """
        Computes the maximum of every pooling window.

        For the backward pass, `max_indices[b, c, i, j]` stores the flat
        position (row * kernel_size + col) of the first maximum inside the
        window that produced output element (i, j).
        """
        batch_size, channels, height, width = input.shape
        pad, k, stride = self.padding, self.kernel_size, self.stride
        # -inf needs a floating dtype, so the input is converted before padding.
        input_padded = np.pad(np.asarray(input, dtype=float),
                              ((0, 0), (0, 0), (pad, pad), (pad, pad)),
                              mode='constant', constant_values=-np.inf)

        out_height = (height + 2 * pad - k) // stride + 1
        out_width = (width + 2 * pad - k) // stride + 1
        self.output = np.zeros((batch_size, channels, out_height, out_width))
        self.max_indices = np.zeros((batch_size, channels, out_height, out_width), dtype=int)

        for i in range(out_height):
            h_start = i * stride
            for j in range(out_width):
                w_start = j * stride
                window = input_padded[:, :, h_start:h_start + k, w_start:w_start + k]
                flat_window = window.reshape(batch_size, channels, -1)
                self.max_indices[:, :, i, j] = np.argmax(flat_window, axis=2)
                self.output[:, :, i, j] = np.max(flat_window, axis=2)
        return  self.output

    def updateGradInput(self, input, gradOutput):
        """
        Routes every output gradient to the single input element that was
        the maximum of its window (the first one if there are ties).
        Overlapping windows accumulate into the same element.
        """
        batch_size, channels, height, width = input.shape
        pad, k, stride = self.padding, self.kernel_size, self.stride
        # Work in padded coordinates, then cut the padding away.
        grad_padded = np.zeros((batch_size, channels, height + 2 * pad, width + 2 * pad))
        batch_idx, channel_idx = np.indices((batch_size, channels))

        out_height, out_width = gradOutput.shape[2], gradOutput.shape[3]

        for i in range(out_height):
            for j in range(out_width):
                flat_index = self.max_indices[:, :, i, j]
                rows = i * stride + flat_index // k
                cols = j * stride + flat_index % k
                # For a fixed (i, j) every (batch, channel) pair addresses a
                # distinct element, so fancy-indexed `+=` cannot lose updates.
                grad_padded[batch_idx, channel_idx, rows, cols] += gradOutput[:, :, i, j]

        self.gradInput = grad_padded[:, :, pad:pad + height, pad:pad + width]
        return self.gradInput

    def __repr__(self):
        return "MaxPool2d"
class AvgPool2d(Module):
    """
    2D average pooling over input of shape (batch_size, channels, H, W).

    `kernel_size`, `stride` and `padding` are ints applied to both spatial
    axes. Padding is filled with zeros and the padded zeros are included in
    the average (every window is divided by kernel_size ** 2).
    """
    def __init__(self, kernel_size, stride, padding):
        super(AvgPool2d, self).__init__()

        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding

    def updateOutput(self, input):
        """
        Computes the mean of every pooling window.
        """
        batch_size, channels, height, width = input.shape
        pad, k, stride = self.padding, self.kernel_size, self.stride
        input_padded = np.pad(input, ((0, 0), (0, 0), (pad, pad), (pad, pad)),
                              mode='constant', constant_values=0)

        out_height = (height + 2 * pad - k) // stride + 1
        out_width = (width + 2 * pad - k) // stride + 1
        self.output = np.zeros((batch_size, channels, out_height, out_width))

        for i in range(out_height):
            h_start = i * stride
            for j in range(out_width):
                w_start = j * stride
                window = input_padded[:, :, h_start:h_start + k, w_start:w_start + k]
                self.output[:, :, i, j] = np.mean(window, axis=(2, 3))
        return  self.output

    def updateGradInput(self, input, gradOutput):
        """
        Spreads every output gradient evenly over its pooling window.
        Gradient that falls onto the padding is discarded.
        """
        batch_size, channels, height, width = input.shape
        pad, k, stride = self.padding, self.kernel_size, self.stride
        grad_padded = np.zeros((batch_size, channels, height + 2 * pad, width + 2 * pad))

        out_height, out_width = gradOutput.shape[2], gradOutput.shape[3]
        window_area = float(k * k)

        for i in range(out_height):
            h_start = i * stride
            for j in range(out_width):
                w_start = j * stride
                share = gradOutput[:, :, i, j, None, None] / window_area
                grad_padded[:, :, h_start:h_start + k, w_start:w_start + k] += share

        self.gradInput = grad_padded[:, :, pad:pad + height, pad:pad + width]
        return self.gradInput

    def __repr__(self):
        return "AvgPool2d"


#8. (0.3) Implement **GlobalMaxPool2d** and **GlobalAvgPool2d**. They do not have testing and parameters are up to you but they must aggregate information within channels. Write test functions for these layers on your own.

## 9. (0.2) Implement [**Flatten**](https://pytorch.org/docs/stable/generated/torch.flatten.html)

In [21]:
class Flatten(Module):
    """
    Collapses the dimensions `start_dim..end_dim` (inclusive) into one.
    """
    def __init__(self, start_dim=0, end_dim=-1):
        super(Flatten, self).__init__()
        self.start_dim = start_dim
        self.end_dim = end_dim

    def updateOutput(self, input):
        """
        Reshapes `input` and remembers its original shape for the backward pass.
        """
        shape = input.shape
        self.input_shape = shape
        # -1 cannot be used directly in the `last + 1` slice bound below.
        last = len(shape) - 1 if self.end_dim == -1 else self.end_dim
        leading = shape[:self.start_dim]
        merged = shape[self.start_dim:last + 1]
        trailing = shape[last + 1:]
        self.output = input.reshape(leading + (np.prod(merged),) + trailing)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Restores the shape recorded in the last forward pass.
        """
        original_shape = self.input_shape
        self.gradInput = gradOutput.reshape(original_shape)
        return self.gradInput

    def __repr__(self):
        return "Flatten"

# Activation functions

Here's the complete example for the **Rectified Linear Unit** non-linearity (aka **ReLU**):

In [22]:
class ReLU(Module):
    """
    Rectified Linear Unit: element-wise max(x, 0).
    """
    def __init__(self):
         super(ReLU, self).__init__()

    def updateOutput(self, input):
        """Clamps negative entries of `input` to zero."""
        clipped = np.maximum(input, 0)
        self.output = clipped
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Lets the gradient pass only where `input` was strictly positive."""
        active = input > 0
        self.gradInput = np.multiply(gradOutput, active)
        return self.gradInput

    def __repr__(self):
        return "ReLU"

## 10. (0.1) Leaky ReLU
Implement [**Leaky Rectified Linear Unit**](http://en.wikipedia.org/wiki%2FRectifier_%28neural_networks%29%23Leaky_ReLUs). Experiment with the slope.

In [23]:
class LeakyReLU(Module):
    """
    Leaky ReLU: f(x) = x for x > 0 and slope * x otherwise.
    """
    def __init__(self, slope = 0.03):
        super(LeakyReLU, self).__init__()

        self.slope = slope

    def updateOutput(self, input):
        """
        Applies the activation element-wise.

        The branch is selected by the sign of `input` rather than by
        max(x, slope * x): the two agree for slope <= 1, but only the
        sign-based form matches the gradient below for slope > 1.
        """
        self.output = np.where(input > 0, input, self.slope * input)
        return  self.output

    def updateGradInput(self, input, gradOutput):
        """
        Scales the gradient by 1 where `input` > 0 and by `slope` elsewhere.
        """
        self.gradInput = np.where(input > 0, gradOutput, self.slope * gradOutput)
        return self.gradInput

    def __repr__(self):
        return "LeakyReLU"

## 11. (0.1) ELU
Implement [**Exponential Linear Units**](http://arxiv.org/abs/1511.07289) activations.

In [18]:
class ELU(Module):
    """
    Exponential Linear Unit: f(x) = x for x > 0 and alpha * (exp(x) - 1) otherwise.
    """
    def __init__(self, alpha = 1.0):
        super(ELU, self).__init__()

        self.alpha = alpha

    def updateOutput(self, input):
        """
        Applies the activation element-wise.

        np.where evaluates both branches for every element, so exp() is fed
        min(x, 0): the value is unchanged where the exponential branch is
        selected and large positive inputs can no longer overflow.
        """
        negative_part = self.alpha * (np.exp(np.minimum(input, 0)) - 1)
        self.output = np.where(input > 0, input, negative_part)
        return  self.output

    def updateGradInput(self, input, gradOutput):
        """
        Multiplies the gradient by f'(x): 1 for x > 0, alpha * exp(x) otherwise.
        """
        elu_grad = np.where(input > 0, 1, self.alpha * np.exp(np.minimum(input, 0)))
        self.gradInput = gradOutput * elu_grad
        return self.gradInput

    def __repr__(self):
        return "ELU"

## 12. (0.1) SoftPlus
Implement [**SoftPlus**](https://en.wikipedia.org/wiki%2FRectifier_%28neural_networks%29) activations. Look, how they look a lot like ReLU.

In [19]:
class SoftPlus(Module):
    """
    SoftPlus: f(x) = log(1 + exp(x)), a smooth approximation of ReLU.
    """
    def __init__(self):
        super(SoftPlus, self).__init__()

    def updateOutput(self, input):
        """
        Applies the activation element-wise.

        log(1 + exp(x)) is evaluated as logaddexp(0, x), which does not
        overflow to inf for large x.
        """
        self.output = np.logaddexp(0, input)
        return  self.output

    def updateGradInput(self, input, gradOutput):
        """
        Multiplies the gradient by f'(x) = sigmoid(x).

        sigmoid(x) = exp(-log(1 + exp(-x))) is computed through logaddexp so
        that very negative inputs do not overflow inside exp(-x).
        """
        sigmoid = np.exp(-np.logaddexp(0, -input))
        self.gradInput = gradOutput * sigmoid
        return self.gradInput

    def __repr__(self):
        return "SoftPlus"

## 13. (0.2) Gelu
Implement [**Gelu**](https://pytorch.org/docs/stable/generated/torch.nn.GELU.html) activations.

In [20]:
class Gelu(Module):
    """
    GELU activation, tanh approximation:

        f(x) = 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))

    NOTE(review): this is the approximate form (what torch.nn.GELU computes
    with approximate='tanh'), not the exact erf-based definition.
    """
    # Constants of the tanh approximation.
    SQRT_2_OVER_PI = np.sqrt(2 / np.pi)
    CUBIC_COEF = 0.044715

    def __init__(self):
        super(Gelu, self).__init__()

    def _tanh_arg(self, x):
        """Argument of the tanh: sqrt(2 / pi) * (x + 0.044715 * x^3)."""
        return self.SQRT_2_OVER_PI * (x + self.CUBIC_COEF * x ** 3)

    def updateOutput(self, input):
        """
        Forward pass. Remembers `input` in `self.input`, stores the activation
        in `self.output` and returns it.
        """
        self.input = input
        self.output = 0.5 * input * (1 + np.tanh(self._tanh_arg(input)))
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Backward pass. Multiplies `gradOutput` element-wise by df/dx, stores the
        result in `self.gradInput` and returns it.

        With u = sqrt(2 / pi) * (x + c * x^3):
            df/dx = 0.5 * (1 + tanh(u)) + 0.5 * x * sech^2(u) * du/dx
        """
        self.input = input
        tanh_val = np.tanh(self._tanh_arg(input))
        # sech^2(u) = 1 - tanh^2(u). The equivalent 1 / cosh(u)**2 overflows
        # once |u| reaches a few hundred, i.e. already for |x| around 20.
        sech_sq_val = 1 - tanh_val ** 2
        d_tanh_arg_dx = self.SQRT_2_OVER_PI * (1 + 3 * self.CUBIC_COEF * input ** 2)
        d_gelu_dx = 0.5 * (1 + tanh_val) + 0.5 * input * sech_sq_val * d_tanh_arg_dx
        self.gradInput = gradOutput * d_gelu_dx
        return self.gradInput

    def __repr__(self):
        """Return the short, human-readable name of this layer."""
        return "Gelu"

# Criterions

Criterions are used to score the model's answers.

In [24]:
class Criterion(object):
    """
    Base class for loss functions.

    Subclasses override `updateOutput` (loss value) and `updateGradInput`
    (gradient of the loss with respect to `input`); callers use `forward`
    and `backward`, which simply delegate to those two methods.
    """
    def __init__ (self):
        self.gradInput = None
        self.output = None

    def forward(self, input, target):
        """
            Evaluate the loss for `input` against `target` and return it.

            Do not override this method; put the computation
            in `updateOutput` instead.
        """
        loss = self.updateOutput(input, target)
        return loss

    def backward(self, input, target):
        """
            Evaluate the gradient of the loss for `input` against `target`
            and return it.

            Do not override this method; put the computation
            in `updateGradInput` instead.
        """
        grad = self.updateGradInput(input, target)
        return grad

    def updateOutput(self, input, target):
        """
        Hook for subclasses. The base version computes nothing and returns
        whatever is currently stored in `self.output`.
        """
        return self.output

    def updateGradInput(self, input, target):
        """
        Hook for subclasses. The base version computes nothing and returns
        whatever is currently stored in `self.gradInput`.
        """
        return self.gradInput

    def __repr__(self):
        """
        Name shown when the criterion is printed. Override it in every
        subclass to get a readable description.
        """
        return "Criterion"

The **MSECriterion**, which is the basic L2 loss commonly used for regression, is implemented here for you.
- input:   **`batch_size x n_feats`**
- target: **`batch_size x n_feats`**
- output: **scalar**

In [25]:
class MSECriterion(Criterion):
    """
    Squared-error loss: the squared differences between `input` and `target`
    are summed over all elements and divided by `input.shape[0]`.
    """
    def __init__(self):
        super(MSECriterion, self).__init__()

    def updateOutput(self, input, target):
        """Compute the loss, store it in `self.output` and return it."""
        residual = input - target
        batch_size = input.shape[0]
        self.output = np.sum(np.power(residual, 2)) / batch_size
        return self.output

    def updateGradInput(self, input, target):
        """
        Compute the gradient of the loss with respect to `input`,
        store it in `self.gradInput` and return it.
        """
        residual = input - target
        self.gradInput = residual * 2 / input.shape[0]
        return self.gradInput

    def __repr__(self):
        """Return the short, human-readable name of this criterion."""
        return "MSECriterion"

## 14. (0.2) Negative LogLikelihood criterion (numerically unstable)
Your task is to implement the **ClassNLLCriterion**. It should implement [multiclass log loss](http://scikit-learn.org/stable/modules/model_evaluation.html#log-loss). Although there is a sum over `y` (target) in that formula,
remember that targets are one-hot encoded; this simplifies the computations a lot. Note that criterions are the only place where you divide by the batch size. There is also a small hack: a small number is added to the probabilities to avoid computing log(0).
- input:   **`batch_size x n_feats`** - probabilities
- target: **`batch_size x n_feats`** - one-hot representation of ground truth
- output: **scalar**



In [23]:
class ClassNLLCriterionUnstable(Criterion):
    """
    Negative log-likelihood on probabilities.

    `input` holds probabilities and `target` is multiplied with their
    logarithm element-wise; the sum is negated and divided by
    `input.shape[0]`.
    """
    # Probabilities are clamped to [EPS, 1 - EPS] before log / division.
    EPS = 1e-15
    def __init__(self):
        super(ClassNLLCriterionUnstable, self).__init__()

    def _clamp(self, input):
        """Keep probabilities away from 0 and 1 to avoid numerical errors."""
        return np.clip(input, self.EPS, 1 - self.EPS)

    def updateOutput(self, input, target):
        """Compute the loss, store it in `self.output` and return it."""
        input_clamp = self._clamp(input)
        self.output = (-np.sum(target * np.log(input_clamp)) / input.shape[0])
        return self.output

    def updateGradInput(self, input, target):
        """
        Compute the gradient of the loss with respect to `input`
        (-target / input, divided by `input.shape[0]`), store it in
        `self.gradInput` and return it.
        """
        input_clamp = self._clamp(input)
        self.gradInput = (-target / (input_clamp * input.shape[0]))
        return self.gradInput

    def __repr__(self):
        """Return the short, human-readable name of this criterion."""
        return "ClassNLLCriterionUnstable"

## 15. (0.3) Negative LogLikelihood criterion (numerically stable)
- input:   **`batch_size x n_feats`** - log probabilities
- target: **`batch_size x n_feats`** - one-hot representation of ground truth
- output: **scalar**

The task is similar to the previous one, but now the criterion input is the output of a log-softmax layer. This decomposition lets us avoid the numerical problems of computing the forward and backward passes of log().

In [24]:
class ClassNLLCriterion(Criterion):
    """
    Negative log-likelihood on log-probabilities.

    `input` is multiplied element-wise with `target`; the sum is negated and
    divided by `input.shape[0]`. No logarithm is taken here, so `input` is
    expected to already hold log-probabilities.
    """
    def __init__(self):
        super(ClassNLLCriterion, self).__init__()

    def updateOutput(self, input, target):
        """Compute the loss, store it in `self.output` and return it."""
        self.output = -np.sum(target * input) / input.shape[0]
        return self.output

    def updateGradInput(self, input, target):
        """
        Compute the gradient of the loss with respect to `input`
        (-target divided by `input.shape[0]`), store it in `self.gradInput`
        and return it.
        """
        self.gradInput = -target / input.shape[0]
        return self.gradInput

    def __repr__(self):
        """Return the short, human-readable name of this criterion."""
        return "ClassNLLCriterion"

1-я часть задания: реализация слоев, лосей и функций активации - 5 баллов. \\
2-я часть задания: реализация моделей на своих классах. Что должно быть:
  1. Выберите оптимизатор и реализуйте его так, чтобы он работал с вашими классами. - 1 балл.
  2. Модель для задачи мультирегрессии на выбранных вами данных. Использовать FCNN, dropout, batchnorm, MSE. Пробуйте различные функции активации. Для первой модели попробуйте большую, среднюю и маленькую модель. - 1 балл.
  3. Модель для задачи мультиклассификации на MNIST. Использовать свёртки, макспулы, флэттэны, софтмаксы. - 1 балл.
  4. Автоэнкодер для выбранных вами данных. Должен быть на свёртках и полносвязных слоях, дропаутах, батчнормах и тд. - 2 балла. \\

Дополнительно в оценке каждой модели будет учитываться:
1. Наличие правильно выбранной метрики и лосс функции.
2. Отрисовка графиков лосей и метрик на трейне-валидации. Проверка качества модели на тесте.
3. Наличие шедулера для lr.
4. Наличие вормапа.
5. Наличие механизма ранней остановки и сохранение лучшей модели.
6. Свитч лося (метрики) и оптимайзера.

**2 ЧАСТЬ ДОМАШКИ, ПОПЫТКИ И ПОДЕЛКИ**




## Оптимизатор

In [100]:
class Adagrad:
    """
    Adagrad optimizer: every parameter array is updated in place by

        accum += grad ** 2
        param -= learning_rate * grad / (sqrt(accum) + epsilon)

    Gradients come either from `grad_parameters` given at construction
    (arrays aligned one-to-one with `parameters`) or from the list passed
    to `step`.
    """
    def __init__(self, parameters, learning_rate=0.001, epsilon=1e-8, grad_parameters=None):
        """
        parameters      -- sequence of arrays to optimize (entries may be None)
        learning_rate   -- step size
        epsilon         -- added to the denominator to avoid division by zero
        grad_parameters -- optional sequence of gradient arrays, same order
                           as `parameters`
        """
        self.parameters = parameters
        self.grad_parameters = grad_parameters
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        # Running sum of squared gradients, one accumulator per parameter.
        self.grad_squared = [np.zeros_like(param) for param in self.parameters]

    def _gradient_source(self, gradParameters, caller):
        """
        Pick the gradient arrays to use: the explicit argument first, then the
        ones registered at construction. With neither available, fall back to
        the historical behaviour (operate on the parameter arrays themselves)
        and warn, because that is not a gradient step.
        """
        if gradParameters is not None:
            return gradParameters
        if self.grad_parameters is not None:
            return self.grad_parameters
        import warnings
        warnings.warn(
            "Adagrad.%s() has no gradients to work with and is operating on the "
            "parameter arrays themselves; pass gradients to step() or give "
            "grad_parameters to the constructor." % caller,
            RuntimeWarning,
            stacklevel=3,
        )
        return self.parameters

    def step(self, gradParameters=None):
        """
        Apply one Adagrad update in place.

        gradParameters -- optional sequence of gradient arrays aligned with
                          `parameters`; overrides `grad_parameters` from the
                          constructor for this call.

        Raises ValueError when the number of gradients does not match the
        number of parameters.
        """
        params = list(self.parameters)
        grads = list(self._gradient_source(gradParameters, "step"))
        if len(grads) != len(params):
            raise ValueError(
                "expected %d gradient arrays, got %d" % (len(params), len(grads))
            )
        for i, (param, grad) in enumerate(zip(params, grads)):
            if param is None or grad is None:
                continue
            self.grad_squared[i] += grad ** 2
            # The right-hand side is fully evaluated before the in-place
            # subtraction, so this is safe even when grad is param.
            param -= self.learning_rate * grad / (np.sqrt(self.grad_squared[i]) + self.epsilon)

    def zero_grad(self):
        """
        Fill the registered gradient arrays with zeros in place. The
        accumulated squared gradients are left untouched.
        """
        for grad in self._gradient_source(None, "zero_grad"):
            if grad is not None:
                grad.fill(0)
# Mean Squared Error before optimization: 3.885444712971214
# Updated weights: [[ 0.16187615 -0.02891488  0.22429851 -0.16696127 -0.30338922 -0.06241489
#   -0.1038853   0.18659887 -0.06429343 -0.12891891]
#  [ 0.03704954  0.29207653 -0.15081989 -0.23375312 -0.19634278  0.10465812
#    0.23115285 -0.11558665 -0.12893655 -0.29608999]]
# Updated biases: [ 0.03023535 -0.02040391]
# Mean Squared Error after optimization: 3.8194445855494394
# Gradients after zeroing: Linear 10 -> 2

In [134]:
class OptimizerAdam:
    """
    Adam optimizer operating in place on a list of parameter arrays.

    For every non-None parameter a record is kept in `parameter_data` with
    its position in the original list ('index'), the array itself ('param')
    and the first/second moment estimates ('m', 'v').
    """
    def __init__(self, parameters, lr=0.001, b1=0.9, b2=0.999, eps=1e-8):
        """
        parameters -- iterable of arrays to optimize (None entries are skipped)
        lr         -- step size
        b1, b2     -- decay rates of the first and second moment estimates
        eps        -- added to the denominator to avoid division by zero
        """
        self.lr = lr
        self.b1 = b1
        self.b2 = b2
        self.eps = eps
        self.parameter_data = []
        for i, param in enumerate(parameters):
            if param is not None:
                self.parameter_data.append({
                    'index': i,
                    'param': param,
                    'm': np.zeros_like(param),
                    'v': np.zeros_like(param)
                })
        # Number of steps taken so far; drives the bias correction.
        self.t = 0

    def step(self, gradParameters):
        """
        Apply one Adam update in place.

        gradParameters -- sequence of gradients indexed like the `parameters`
                          list given to the constructor.
        """
        self.t += 1
        # Bias-correction denominators are the same for every parameter.
        correction_m = 1 - self.b1 ** self.t
        correction_v = 1 - self.b2 ** self.t
        for data in self.parameter_data:
            grad = gradParameters[data['index']]
            m = self.b1 * data['m'] + (1 - self.b1) * grad
            v = self.b2 * data['v'] + (1 - self.b2) * (grad ** 2)
            data['m'] = m
            data['v'] = v
            estim_m = m / correction_m
            estim_v = v / correction_v
            # Augmented assignment on the dict entry updates the array in place
            # and stores the result back under 'param'.
            data['param'] -= self.lr * estim_m / (np.sqrt(estim_v) + self.eps)

    def zero_grad(self, gradParameters=None):
        """
        Fill the given gradient arrays with zeros in place (None entries are
        skipped). Without an argument this is a no-op.

        The moment estimates are deliberately left alone: clearing them here
        while `t` keeps counting would put the bias correction out of step
        with the moments. Use `reset_state` to restart the optimizer.
        """
        if gradParameters is None:
            return
        for grad in gradParameters:
            if grad is not None:
                grad.fill(0)

    def reset_state(self):
        """Reset the moment estimates and the step counter to their initial values."""
        self.t = 0
        for data in self.parameter_data:
            data['m'] = np.zeros_like(data['param'])
            data['v'] = np.zeros_like(data['param'])