In [2]:
import torch 
from torch import optim

In [3]:
class Linear(object):
    """Linear (fully-connected) layer with hand-written forward/backward passes.

    Both methods are static, so they can be called either on the class
    (``Linear.forward(x, w, b)``) or on an instance.
    """

    @staticmethod
    def forward(x, w, b):
        """
        Computes the forward pass for a linear (fully-connected) layer.
        The input x has shape (N, d_1, ..., d_k) and contains a minibatch of N
        examples, where each example x[i] has shape (d_1, ..., d_k). We will
        reshape each input into a vector of dimension D = d_1 * ... * d_k, and
        then transform it to an output vector of dimension M.
        Inputs:
        - x: A tensor containing input data, of shape (N, d_1, ..., d_k)
        - w: A tensor of weights, of shape (D, M)
        - b: A tensor of biases, of shape (M,)
        Returns a tuple of:
        - out: output, of shape (N, M)
        - cache: (x, w, b)
        """
        # Flatten every example into one row of length D.
        x_row = x.reshape(x.shape[0], -1)

        # Matrix multiplication followed by a broadcasted bias addition.
        out = x_row.mm(w) + b

        # The un-flattened x is cached so backward can restore its shape.
        cache = (x, w, b)
        return out, cache

    @staticmethod
    def backward(dout, cache):
        """
        Computes the backward pass for a linear layer.
        Inputs:
        - dout: Upstream derivative, of shape (N, M)
        - cache: Tuple of:
          - x: Input data, of shape (N, d_1, ... d_k)
          - w: Weights, of shape (D, M)
          - b: Biases, of shape (M,)

        Returns a tuple of:
        - dx: Gradient with respect to x, of shape
          (N, d1, ..., d_k)
        - dw: Gradient with respect to w, of shape (D, M)
        - db: Gradient with respect to b, of shape (M,)
        """
        x, w, b = cache

        # Same flattening as in the forward pass.
        x_row = x.reshape(x.shape[0], -1)

        dx = dout.mm(w.t()).reshape(x.shape)  # back to the original shape of x
        dw = x_row.t().mm(dout)
        db = dout.sum(dim=0)  # bias is shared by all rows, so sum over them

        return dx, dw, db



In [4]:
class ReLU(object):
    """Rectified linear unit with hand-written forward/backward passes.

    Both methods are static, so they can be called either on the class
    (``ReLU.forward(x)``) or on an instance.
    """

    @staticmethod
    def forward(x):
        """
        Computes the forward pass for a layer of rectified
        linear units (ReLUs).

        Input:
        - x: Input; a tensor of any shape

        Returns a tuple of:
        - out: Output, a tensor of the same shape as x
        - cache: x
        """
        out = torch.relu(x)
        cache = x
        return out, cache

    @staticmethod
    def backward(dout, cache):
        """
        Computes the backward pass for a layer of rectified
        linear units (ReLUs).

        Input:
        - dout: Upstream derivatives, of any shape
        - cache: Input x, of same shape as dout

        Returns:
        - dx: Gradient with respect to x
        """
        # Gradient flows only through the strictly positive inputs.
        dx = dout * (cache > 0)
        return dx

In [5]:
class Linear_ReLU(object):
    """Convenience layer: a linear transform followed by a ReLU.

    All methods are static, so they can be called either on the class or on
    an instance.
    """

    @staticmethod
    def forward(x, w, b):
        """
        Performs a linear transform followed by a ReLU.
        Inputs:
        - x: Input to the linear layer, of shape (N, d_1, ..., d_k)
        - w, b: Weights (D, M) and biases (M,) for the linear layer
        Returns a tuple of:
        - out: Output from the ReLU, of shape (N, M)
        - cache: (fc_cache, relu_cache) to give to the backward pass
        """
        # Flatten every example into a row; reshape also accepts
        # non-contiguous inputs, which view does not.
        x_row = x.reshape(x.shape[0], -1)

        # Linear transform.
        pre_activation = x_row.mm(w) + b

        # Keep the ORIGINAL x so backward can return dx in the caller's shape.
        fc_cache = (x, w, b)

        # ReLU activation.
        out = torch.relu(pre_activation)

        # The ReLU output is positive exactly where its input is positive, so
        # it is sufficient for building the gradient mask.
        relu_cache = out

        cache = (fc_cache, relu_cache)
        return out, cache

    @staticmethod
    def backward(dout, cache):
        """
        Backward pass for the linear-relu convenience layer.
        Inputs:
        - dout: Upstream derivatives, of shape (N, M)
        - cache: Tuple of (fc_cache, relu_cache) from forward pass
        Returns a tuple of:
        - dx: Gradient with respect to x, same shape as the forward input
        - dw: Gradient with respect to w
        - db: Gradient with respect to b
        """
        fc_cache, relu_cache = cache

        # Gradient through the ReLU.
        da = dout * (relu_cache > 0)

        x, w, b = fc_cache
        x_row = x.reshape(x.shape[0], -1)

        # Gradient through the linear transform; dx takes the shape of the
        # forward input instead of a hard-coded one.
        dx = da.mm(w.t()).reshape(x.shape)
        dw = x_row.t().mm(da)
        db = da.sum(dim=0)

        return dx, dw, db

    @staticmethod
    def softmax_loss(x, y):
        """
        Computes the loss and gradient for softmax classification.
        Inputs:
        - x: Input data, of shape (N, C) where x[i, j] is the score for
          the jth class for the ith input.
        - y: Vector of labels, of shape (N,) where y[i] is the label
          for x[i] and 0 <= y[i] < C
        Returns a tuple of:
        - loss: Scalar giving the loss
        - dx: Gradient of the loss with respect to x
        """
        N = x.shape[0]
        rows = torch.arange(N)

        # Log-softmax via log-sum-exp: taking log(probs) directly yields -inf
        # (infinite loss) when a probability underflows to zero.
        shifted = x - x.max(dim=1, keepdim=True)[0]
        log_probs = shifted - torch.log(torch.exp(shifted).sum(dim=1, keepdim=True))

        # Cross-entropy loss averaged over the batch.
        loss = -log_probs[rows, y].mean()

        # d(loss)/dx = (softmax - one_hot) / N
        dx = torch.exp(log_probs)
        dx[rows, y] -= 1
        dx /= N

        return loss, dx




In [6]:
 def softmax_loss(x, y):
     """
     Computes the loss and gradient for softmax classification.
     Inputs:
     - x: Input data, of shape (N, C) where x[i, j] is the score for
       the jth class for the ith input.
     - y: Vector of labels, of shape (N,) where y[i] is the label
       for x[i] and 0 <= y[i] < C
     Returns a tuple of:
     - loss: Scalar giving the loss
     - dx: Gradient of the loss with respect to x
     """
     # Number of samples in the batch.
     N = x.shape[0]
     rows = torch.arange(N)

     # Log-softmax via log-sum-exp. Taking log(probs) directly yields -inf
     # (and an infinite loss) when the correct-class probability underflows.
     shifted = x - x.max(dim=1, keepdim=True)[0]
     log_probs = shifted - torch.log(torch.exp(shifted).sum(dim=1, keepdim=True))

     # Cross-entropy loss averaged over the batch.
     loss = -log_probs[rows, y].mean()

     # d(loss)/dx = (softmax - one_hot) / N
     dx = torch.exp(log_probs)
     dx[rows, y] -= 1
     dx /= N

     return loss, dx

In [7]:
def linear_backward(dscores, h, W2, b2):
    """Backward pass of a linear layer ``scores = h.mm(W2) + b2``.

    Inputs:
    - dscores: Upstream gradient with respect to the layer output
    - h: Input that was fed to the layer
    - W2: Weight matrix of the layer
    - b2: Bias vector of the layer (accepted but not read)

    Returns a tuple of:
    - Gradient with respect to the layer input h
    - Gradient with respect to W2
    - Gradient with respect to b2
    """
    grad_bias = dscores.sum(dim=0)
    grad_weight = h.t().mm(dscores)
    grad_input = dscores.mm(W2.t())
    return grad_input, grad_weight, grad_bias


In [8]:
def relu_backward(dout, cache):
    """Backward pass of a ReLU.

    Inputs:
    - dout: Upstream derivatives
    - cache: Tensor whose strictly positive entries mark where gradient flows

    Returns:
    - Gradient with respect to the ReLU input
    """
    active = cache > 0
    return dout * active

In [9]:
import torch
class TwoLayerNet(object):
    """Two-layer fully-connected network: linear - relu - linear - softmax.

    Learnable parameters live in the ``self.params`` dictionary under the
    keys 'W1', 'b1', 'W2' and 'b2'.
    """

    def __init__(self, input_dim=3*32*32, hidden_dim=100, num_classes=10, weight_scale=1e-3, reg=0.0, dtype=torch.float32,
                 device='cpu'):
        """Initialize the network.

        Inputs:
        - input_dim: Size of each (flattened) input example
        - hidden_dim: Size of the hidden layer
        - num_classes: Number of classes to classify
        - weight_scale: Factor applied to the standard-normal initial weights
        - reg: Scalar giving L2 regularization strength
        - dtype: torch dtype of the parameters
        - device: Device on which the parameters are created
        """
        self.params = {}
        self.reg = reg
        # First layer: small random weights, zero biases.
        self.params['W1'] = torch.randn(input_dim, hidden_dim, dtype=dtype, device=device) * weight_scale
        self.params['b1'] = torch.zeros(hidden_dim, dtype=dtype, device=device)
        # Second layer: small random weights, zero biases.
        self.params['W2'] = torch.randn(hidden_dim, num_classes, dtype=dtype, device=device) * weight_scale
        self.params['b2'] = torch.zeros(num_classes, dtype=dtype, device=device)

    def save(self, path):
        """Write the regularization strength and the parameters to `path`."""
        checkpoint = {
          'reg': self.reg,
          'params': self.params,
        }
        torch.save(checkpoint, path)
        print("Saved in {}".format(path))

    def load(self, path, dtype, device):
        """Restore a checkpoint written by `save`.

        Inputs:
        - path: Checkpoint file to read
        - dtype: torch dtype every parameter is converted to
        - device: Device every parameter is moved to
        """
        # Use the `device` argument, not the literal string 'device', which
        # is not a valid location.
        checkpoint = torch.load(path, map_location=device)
        self.params = checkpoint['params']
        self.reg = checkpoint['reg']
        for p in self.params:
            self.params[p] = self.params[p].type(dtype).to(device)
        print("load checkpoint file: {}".format(path))

    def loss(self, X, y=None):
        """Compute scores, or loss and gradients, for a minibatch.

        Inputs:
        - X: Input data of shape (N, d_1, ..., d_k); each example is
          flattened to a row before the first layer
        - y: Labels of shape (N,), or None

        Returns:
        If y is None, the class scores of shape (N, C).
        Otherwise a tuple of:
        - loss: Scalar loss (softmax + L2 regularization)
        - grads: Dictionary with the same keys as self.params, mapping
          parameter names to gradients of the loss
        """
        # Unpack the parameters
        W1, b1 = self.params['W1'], self.params['b1']
        W2, b2 = self.params['W2'], self.params['b2']

        # Forward pass: flatten, hidden layer + ReLU, then the output layer.
        # The reshape is a no-op for inputs that are already (N, D).
        X_row = X.reshape(X.shape[0], -1)
        h = torch.relu(X_row.mm(W1) + b1)
        scores = h.mm(W2) + b2

        # If y is None, we are in test mode, so just return scores
        if y is None:
            return scores

        # Softmax data loss plus the L2 regularization term
        loss, dscores = softmax_loss(scores, y)
        reg = self.reg
        loss += reg * (torch.sum(W1 ** 2) + torch.sum(W2 ** 2))

        # Backward pass through the output layer
        dh, dW2, db2 = linear_backward(dscores, h, W2, b2)
        dW2 += 2 * reg * W2
        # Through the ReLU; h > 0 exactly where the pre-activation was > 0
        dh = relu_backward(dh, h)
        # Through the hidden layer; the gradient w.r.t. the input is unused
        _, dW1, db1 = linear_backward(dh, X_row, W1, b1)
        dW1 += 2 * reg * W1

        grads = {
            'W1': dW1,
            'b1': db1,
            'W2': dW2,
            'b2': db2,
        }

        return loss, grads

# ##################################################################

In [10]:
def affine_relu_forward(x, w, b):
    """Run an affine transform and feed its result through a ReLU.

    Inputs:
    - x: Input to the affine layer
    - w, b: Weights for the affine layer

    Returns a tuple of:
    - out: Output from the ReLU
    - cache: (fc_cache, relu_cache) pair to give to the backward pass
    """
    pre_activation, fc_cache = affine_forward(x, w, b)
    out, relu_cache = relu_forward(pre_activation)
    return out, (fc_cache, relu_cache)

In [11]:
def affine_relu_backward(dout, cache):
    """Backward pass for the affine-relu convenience layer.

    Inputs:
    - dout: Upstream derivatives
    - cache: (fc_cache, relu_cache) pair produced by the forward pass

    Returns the tuple (dx, dw, db) produced by affine_backward.
    """
    fc_cache, relu_cache = cache
    # Undo the ReLU first, then the affine transform.
    grad_pre_activation = relu_backward(dout, relu_cache)
    return affine_backward(grad_pre_activation, fc_cache)

In [12]:
import torch
def affine_forward(x, w, b):
    """Forward pass of an affine (fully-connected) layer.

    Inputs:
    - x: Input data; everything after the first dimension is flattened
    - w: Weights
    - b: Biases

    Returns a tuple of:
    - out: Flattened input times w, plus b
    - cache: (x, w, b)
    """
    batch_size = x.shape[0]
    flat = x.reshape(batch_size, -1)  # one row per example
    return flat.matmul(w) + b, (x, w, b)

In [13]:
import torch
def affine_backward(dout, cache):
    """Backward pass of an affine (fully-connected) layer.

    Inputs:
    - dout: Upstream derivatives
    - cache: (x, w, b) tuple stored by affine_forward

    Returns a tuple of:
    - dx: Gradient with respect to x, with x's trailing dimensions restored
    - dw: Gradient with respect to w
    - db: Gradient with respect to b
    """
    x, w, _ = cache

    # Bias gradient: sum the upstream gradient over the rows.
    db = dout.sum(dim=0)

    # Weight gradient uses the row-flattened input.
    flat_x = x.reshape(x.shape[0], -1)
    dw = flat_x.T.matmul(dout)

    # Input gradient, reshaped back to the trailing dimensions of x.
    flat_dx = dout.matmul(w.T)
    dx = flat_dx.reshape((flat_dx.shape[0], *x.shape[1:]))

    return dx, dw, db


In [14]:
import torch
def relu_forward(x):
    """Forward pass of a ReLU.

    Input:
    - x: Input tensor

    Returns a tuple of:
    - out: x where x is strictly positive, 0 elsewhere
    - cache: x
    """
    is_positive = x > 0
    return torch.where(is_positive, x, 0), x


In [15]:
import torch
def relu_backwardd(dout, cache):
    """Backward pass of a ReLU.

    Inputs:
    - dout: Upstream derivatives
    - cache: Input x that was given to the forward pass

    Returns:
    - dout where x is strictly positive, 0 elsewhere
    """
    is_positive = cache > 0
    return torch.where(is_positive, dout, 0)

In [16]:
import torch
def batchnorm_forward(x, gamma, beta, bn_param):
    """Forward pass for batch normalization.

    During training the sample mean and (uncorrected) sample variance are
    computed from minibatch statistics and used to normalize the incoming data.
    During training we also keep an exponentially decaying running mean of the
    mean and variance of each feature, and these averages are used to normalize
    data at test-time.

    At each timestep we update the running averages for mean and variance using
    an exponential decay based on the momentum parameter:

    running_mean = momentum * running_mean + (1 - momentum) * sample_mean
    running_var = momentum * running_var + (1 - momentum) * sample_var

    Note that the batch normalization paper suggests a different test-time
    behavior: they compute sample mean and variance for each feature using a
    large number of training images rather than using a running average. For
    this implementation we have chosen to use running averages instead since
    they do not require an additional estimation step; the torch7
    implementation of batch normalization also uses running averages.

    Input:
    - x: Data of shape (N, D)
    - gamma: Scale parameter of shape (D,)
    - beta: Shift parameter of shape (D,)
    - bn_param: Dictionary with the following keys:
      - mode: 'train' or 'test'; required
      - eps: Constant for numeric stability
      - momentum: Constant for running mean / variance.
      - running_mean: Tensor of shape (D,) giving running mean of features
      - running_var: Tensor of shape (D,) giving running variance of features

    Returns a tuple of:
    - out: of shape (N, D)
    - cache: In 'train' mode a dictionary of the values needed in the
      backward pass; None in 'test' mode

    Raises:
    - ValueError: if mode is neither 'train' nor 'test'

    Side effect: bn_param['running_mean'] and bn_param['running_var'] are
    written back on every call.
    """
    mode = bn_param["mode"]
    eps = bn_param.get("eps", 1e-5)
    momentum = bn_param.get("momentum", 0.9)

    N, D = x.shape
    # Default running statistics are created next to x (dtype and device).
    running_mean = bn_param.get("running_mean")
    if running_mean is None:
        running_mean = torch.zeros(D, dtype=x.dtype, device=x.device)
    running_var = bn_param.get("running_var")
    if running_var is None:
        running_var = torch.zeros(D, dtype=x.dtype, device=x.device)

    out, cache = None, None
    if mode == "train":
        sample_mean = torch.mean(x, dim=0)
        # Uncorrected (biased) variance, as documented above; the backward
        # pass differentiates exactly this 1/N estimator.
        sample_var = torch.var(x, dim=0, unbiased=False)
        x_hat = (x - sample_mean) / torch.sqrt(sample_var + eps)
        out = gamma * x_hat + beta

        # Statistics have shape (D,), so the running averages keep it too.
        running_mean = momentum * running_mean + (1 - momentum) * sample_mean
        running_var = momentum * running_var + (1 - momentum) * sample_var

        cache = {
            'x_hat': x_hat,
            'eps': eps,
            'var': sample_var,
            'x': x,
            'mean': sample_mean,
            'gamma': gamma,
        }
    elif mode == "test":
        # Normalize with the accumulated running statistics.
        x_hat = (x - running_mean) / torch.sqrt(running_var + eps)
        out = gamma * x_hat + beta
    else:
        raise ValueError('Invalid forward batchnorm mode "%s"' % mode)

    # Store the updated running means back into bn_param
    bn_param["running_mean"] = running_mean
    bn_param["running_var"] = running_var

    return out, cache



In [17]:
import torch
def batchnorm_backward(dout, cache):
    """Backward pass for batch normalization.

    Gradients are propagated through the computation graph
    x -> (mean, var) -> x_hat -> out, where var is the uncorrected (1/N)
    batch variance.

    Inputs:
    - dout: Upstream derivatives, of shape (N, D)
    - cache: Dictionary of intermediates with the keys 'x', 'mean', 'var',
      'eps', 'x_hat' and 'gamma'

    Returns a tuple of:
    - dx: Gradient with respect to inputs x, of shape (N, D)
    - dgamma: Gradient with respect to scale parameter gamma, of shape (D,)
    - dbeta: Gradient with respect to shift parameter beta, of shape (D,)
    """
    x, mean, var = cache['x'], cache['mean'], cache['var']
    eps, x_hat, gamma = cache['eps'], cache['x_hat'], cache['gamma']
    N = x.shape[0]

    dgamma = torch.sum(x_hat * dout, dim=0)
    dbeta = torch.sum(dout, dim=0)

    # gamma broadcasts against dout directly; wrapping it in torch.tensor([...])
    # is not needed and would detach it from gamma's device.
    dx_hat = gamma * dout

    centered = x - mean
    inv_std = torch.pow(var + eps, -0.5)

    # Direct path x -> x_hat.
    dx_direct = dx_hat * inv_std

    # Path through the variance.
    dvar = torch.sum(dx_hat * centered * (-0.5) * torch.pow(var + eps, -1.5), dim=0)

    # Path through the mean (directly and via the variance).
    dmean = torch.sum(-dx_direct, dim=0) + torch.sum(-centered * 2 / N * dvar, dim=0)

    dx = dx_direct + centered * 2 / N * dvar + dmean / N

    return dx, dgamma, dbeta


In [18]:
import torch
def layernorm_forward(x, gamma, beta, ln_param):
    """Forward pass for layer normalization.

    Every data-point (row of x) is normalized with its own mean and variance
    and then scaled by gamma and shifted by beta. The computation does not
    depend on a train/test mode and keeps no running averages.

    Input:
    - x: Data of shape (N, D)
    - gamma: Scale parameter of shape (D,)
    - beta: Shift parameter of shape (D,)
    - ln_param: Dictionary with the following keys:
        - eps: Constant for numeric stability (defaults to 1e-5)

    Returns a tuple of:
    - out: of shape (N, D)
    - cache: Dictionary with the keys 'x_hat', 'sqrt' and 'gamma' needed in
      the backward pass
    """
    eps = ln_param.get("eps", 1e-5)

    # Work on the transpose so the per-row statistics reduce over axis 0.
    features_first = x.T
    num_features = features_first.shape[0]

    mean = torch.sum(features_first, axis=0) / num_features
    var = torch.sum((features_first - mean) * (features_first - mean), axis=0) / num_features
    std = torch.sqrt(var + eps)

    # Transpose back to (N, D) before applying the scale and shift.
    x_hat = ((features_first - mean) / std).T
    out = x_hat * gamma + beta

    cache = {
        'x_hat': x_hat,
        'sqrt': std,
        'gamma': gamma,
    }
    return out, cache

In [19]:
import torch
def layernorm_backward(dout, cache):
    """Backward pass for layer normalization.

    Same derivation as batch normalization, except that the statistics are
    taken over the features of each data-point instead of over the batch.

    Inputs:
    - dout: Upstream derivatives, of shape (N, D)
    - cache: Dictionary of intermediates with the keys 'x_hat' (N, D),
      'sqrt' (one standard deviation per data-point) and 'gamma' (D,)

    Returns a tuple of:
    - dx: Gradient with respect to inputs x, of shape (N, D)
    - dgamma: Gradient with respect to scale parameter gamma, of shape (D,)
    - dbeta: Gradient with respect to shift parameter beta, of shape (D,)
    """
    x_hat, gamma = cache['x_hat'], cache['gamma']
    # One standard deviation per row, shaped (N, 1) so it broadcasts over D.
    inv_std = 1.0 / cache['sqrt'].reshape(-1, 1)

    dbeta = torch.sum(dout, dim=0)
    dgamma = torch.sum(x_hat * dout, dim=0)

    # gamma broadcasts against dout directly; wrapping it in torch.tensor([...])
    # is not needed and would detach it from gamma's device.
    dx_hat = gamma * dout

    # dx = (dx_hat - mean(dx_hat) - x_hat * mean(dx_hat * x_hat)) / std,
    # with the means taken over the features of each data-point.
    mean_dx_hat = dx_hat.mean(dim=1, keepdim=True)
    mean_dx_hat_x_hat = (dx_hat * x_hat).mean(dim=1, keepdim=True)
    dx = (dx_hat - mean_dx_hat - x_hat * mean_dx_hat_x_hat) * inv_std

    return dx, dgamma, dbeta

In [2]:
import torch
def dropout_forward(x, dropout_param):
    """Forward pass for (inverted) dropout.

    Inputs:
    - x: Input data, of any shape
    - dropout_param: Dictionary with the following keys:
      - p: Probability of DROPPING each unit; units are kept with
        probability 1 - p and the kept ones are scaled by 1 / (1 - p)
      - mode: 'train' or 'test'; in 'test' mode x is returned unchanged
      - seed: Optional; when present the global torch RNG is re-seeded with
        it on every call, which makes the mask deterministic

    Returns a tuple of:
    - out: Tensor of the same shape and dtype as x
    - cache: (dropout_param, mask); mask is None in 'test' mode

    Raises:
    - ValueError: if mode is neither 'train' nor 'test'
    """
    p, mode = dropout_param["p"], dropout_param["mode"]
    if "seed" in dropout_param:
        torch.manual_seed(dropout_param["seed"])

    mask = None
    if mode == "train":
        keep_prob = 1 - p
        # Sample on the CPU (so a given seed yields the same mask wherever x
        # lives), then move the mask next to x before multiplying.
        mask = torch.bernoulli(torch.ones(x.shape) * keep_prob) / keep_prob
        mask = mask.to(x.device)
        out = mask * x
    elif mode == "test":
        out = x
    else:
        raise ValueError('Invalid forward dropout mode "%s"' % mode)

    cache = (dropout_param, mask)
    # mask * x may have been promoted; hand back the caller's dtype.
    out = out.to(x.dtype)
    return out, cache

In [4]:
import torch
def dropout_backward(dout, cache):
    """Backward pass for dropout.

    Inputs:
    - dout: Upstream derivatives, of any shape
    - cache: (dropout_param, mask) from the forward pass

    Returns:
    - dout * mask in 'train' mode, dout itself in 'test' mode, and None for
      any other mode
    """
    dropout_param, mask = cache
    mode = dropout_param["mode"]
    if mode == "train":
        return dout * mask
    if mode == "test":
        return dout
    return None

In [22]:
import torch
def softmax_loss2(x, y):
    """Computes the loss and gradient for softmax classification.

    Inputs:
    - x: Input data, of shape (N, C) where x[i, j] is the score for the jth
      class for the ith input.
    - y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and
      0 <= y[i] < C

    Returns a tuple of:
    - loss: Scalar giving the loss
    - dx: Gradient of the loss with respect to x
    """
    eps = 1e-7  # keeps log() finite when a probability underflows to zero
    num_x = x.shape[0]

    # Shift each row by its maximum before exponentiating: softmax is
    # invariant to the shift, and without it large scores overflow to inf
    # and turn the probabilities into NaN.
    shifted = x - x.max(dim=1, keepdim=True)[0]
    exp_score = torch.exp(shifted)
    norm_score = exp_score / torch.sum(exp_score, dim=1, keepdim=True)

    loss = torch.sum(-1 * torch.log(norm_score[range(num_x), y] + eps)) / num_x

    # d(loss)/dx = (softmax - one_hot) / N
    dx = norm_score.clone()
    dx[range(num_x), y] -= 1
    dx = dx / num_x

    return loss, dx


In [23]:
import torch
class FullyConnectedNet(object):
    """Class for a multi-layer fully connected neural network.

    Network contains an arbitrary number of hidden layers, ReLU nonlinearities,
    and a softmax loss function. This will also implement dropout and batch/layer
    normalization as options. For a network with L layers, the architecture will be

    {affine - [batch/layer norm] - relu - [dropout]} x (L - 1) - affine - softmax

    where batch/layer normalization and dropout are optional and the {...} block is
    repeated L - 1 times.

    Learnable parameters are stored in the self.params dictionary and will be learned
    using the Solver class.
    """

    def __init__(
        self,
        hidden_dims,
        input_dim=3 * 32 * 32,
        num_classes=10,
        dropout_keep_ratio=1,
        normalization=None,
        reg=0.0,
        weight_scale=1e-2,
        dtype=torch.float,
        seed=None,
    ):
        """Initialize a new FullyConnectedNet.

        Inputs:
        - hidden_dims: A list of integers giving the size of each hidden layer.
            May be empty, in which case the network is a single affine layer.
        - input_dim: An integer giving the size of the input.
        - num_classes: An integer giving the number of classes to classify.
        - dropout_keep_ratio: Scalar in (0, 1] giving the fraction of units
            that is KEPT by dropout. If dropout_keep_ratio=1 then the network
            does not use dropout at all.
        - normalization: What type of normalization the network should use. Valid values
            are "batchnorm", "layernorm", or None for no normalization (the default).
        - reg: Scalar giving L2 regularization strength.
        - weight_scale: Scalar giving the standard deviation for random
            initialization of the weights.
        - dtype: A torch datatype object; all parameters are cast to this
            datatype. float32 is faster but less accurate, so you should use
            float64 for numeric gradient checking.
        - seed: If not None, then pass this random seed to the dropout layers.
            This will make the dropout layers deterministic so we can gradient check the model.

        Raises:
        - ValueError: if dropout_keep_ratio is not in (0, 1].
        """
        if not 0 < dropout_keep_ratio <= 1:
            raise ValueError(
                "dropout_keep_ratio must be in (0, 1], got %r" % (dropout_keep_ratio,)
            )

        self.normalization = normalization
        self.use_dropout = dropout_keep_ratio != 1
        self.reg = reg
        self.num_layers = 1 + len(hidden_dims)
        self.dtype = dtype
        self.params = {}

        # Layer i maps layer_dims[i] -> layer_dims[i + 1]; this also covers
        # the case of no hidden layers at all.
        layer_dims = [input_dim, *hidden_dims, num_classes]
        for i in range(self.num_layers):
            fan_in, fan_out = layer_dims[i], layer_dims[i + 1]
            self.params['W' + str(i + 1)] = torch.normal(mean=0, std=weight_scale, size=(fan_in, fan_out))
            self.params['b' + str(i + 1)] = torch.zeros((fan_out,))

        # One scale/shift pair per hidden layer when normalization is on.
        if self.normalization == 'batchnorm' or self.normalization == 'layernorm':
            for j, width in enumerate(hidden_dims, start=1):
                self.params['gamma' + str(j)] = torch.ones(width)
                self.params['beta' + str(j)] = torch.zeros(width)

        # When using dropout we need to pass a dropout_param dictionary to each
        # dropout layer so that the layer knows the dropout probability and the mode
        # (train / test). The same dropout_param is passed to each dropout layer.
        self.dropout_param = {}
        if self.use_dropout:
            # dropout_forward treats "p" as the probability of DROPPING a
            # unit (it keeps with probability 1 - p), so convert the keep
            # ratio accordingly.
            self.dropout_param = {"mode": "train", "p": 1 - dropout_keep_ratio}
            if seed is not None:
                self.dropout_param["seed"] = seed

        # With batch normalization we need to keep track of running means and
        # variances, so we need to pass a special bn_param object to each batch
        # normalization layer: self.bn_params[0] goes to the forward pass of
        # the first batch normalization layer, self.bn_params[1] to the second, etc.
        self.bn_params = []
        if self.normalization == "batchnorm":
            self.bn_params = [{"mode": "train"} for i in range(self.num_layers - 1)]
        if self.normalization == "layernorm":
            self.bn_params = [{} for i in range(self.num_layers - 1)]

        # Cast all parameters to the correct datatype.
        for k, v in self.params.items():
            self.params[k] = v.to(dtype)

    def save(self, path):
        """Write the parameters and the configuration needed by `loss` to `path`."""
        checkpoint = {
          'reg': self.reg,
          'dtype': self.dtype,
          'params': self.params,
          'num_layers': self.num_layers,
          'use_dropout': self.use_dropout,
          'dropout_param': self.dropout_param,
          # Needed to rebuild a normalized network with its running statistics.
          'normalization': self.normalization,
          'bn_params': self.bn_params,
        }

        torch.save(checkpoint, path)
        print("Saved in {}".format(path))

    def load(self, path, dtype, device):
        """Restore a checkpoint written by `save`.

        Inputs:
        - path: Checkpoint file to read
        - dtype: torch dtype every tensor is converted to
        - device: Device every tensor is moved to
        """
        checkpoint = torch.load(path, map_location='cpu')
        self.params = checkpoint['params']
        self.dtype = dtype
        self.reg = checkpoint['reg']
        self.num_layers = checkpoint['num_layers']
        self.use_dropout = checkpoint['use_dropout']
        self.dropout_param = checkpoint['dropout_param']
        # Checkpoints written before these keys existed keep the current values.
        self.normalization = checkpoint.get('normalization', getattr(self, 'normalization', None))
        self.bn_params = checkpoint.get('bn_params', getattr(self, 'bn_params', []))

        for p in self.params:
            self.params[p] = self.params[p].type(dtype).to(device)

        # Running statistics must live next to the parameters.
        for bn_param in self.bn_params:
            for key, value in list(bn_param.items()):
                if isinstance(value, torch.Tensor):
                    bn_param[key] = value.type(dtype).to(device)

        print("load checkpoint file: {}".format(path))

    def loss(self, X, y=None):
        """Compute loss and gradient for the fully connected net.

        Inputs:
        - X: Tensor of input data of shape (N, d_1, ..., d_k)
        - y: Tensor of labels, of shape (N,). y[i] gives the label for X[i].

        Returns:
        If y is None, then run a test-time forward pass of the model and return:
        - scores: Tensor of shape (N, C) giving classification scores, where
            scores[i, c] is the classification score for X[i] and class c.

        If y is not None, then run a training-time forward and backward pass and
        return a tuple of:
        - loss: Scalar value giving the loss
        - grads: Dictionary with the same keys as self.params, mapping parameter
            names to gradients of the loss with respect to those parameters.
        """
        mode = "test" if y is None else "train"

        # Set train/test mode for batchnorm params and dropout param since they
        # behave differently during training and testing.
        if self.use_dropout:
            self.dropout_param["mode"] = mode
        if self.normalization == "batchnorm":
            for bn_param in self.bn_params:
                bn_param["mode"] = mode

        # Forward pass; every layer's cache is stored under '<kind>_<layer>'.
        scores = X
        caches = {}
        last = self.num_layers - 1
        for i in range(self.num_layers):
            idx = str(i + 1)
            scores, caches['af_' + idx] = affine_forward(
                scores, self.params['W' + idx], self.params['b' + idx]
            )
            if i == last:
                # The final affine layer feeds the softmax directly.
                continue
            if self.normalization == 'batchnorm':
                scores, caches['bn_' + idx] = batchnorm_forward(
                    scores, self.params['gamma' + idx], self.params['beta' + idx], self.bn_params[i]
                )
            elif self.normalization == 'layernorm':
                scores, caches['ln_' + idx] = layernorm_forward(
                    scores, self.params['gamma' + idx], self.params['beta' + idx], self.bn_params[i]
                )
            scores, caches['rl_' + idx] = relu_forward(scores)
            if self.use_dropout:
                scores, caches['do_' + idx] = dropout_forward(scores, self.dropout_param)

        # If test mode return early.
        if mode == "test":
            return scores

        grads = {}

        # Data loss plus 0.5 * reg * ||W||^2 for every weight matrix.
        loss, dx = softmax_loss2(scores, y)
        for i in range(self.num_layers):
            loss += 0.5 * self.reg * torch.sum(torch.pow(self.params['W' + str(i + 1)], 2))

        # Backward pass: undo each layer's sub-steps in reverse order.
        for j in reversed(range(self.num_layers)):
            idx = str(j + 1)
            if j != last:
                if self.use_dropout:
                    dx = dropout_backward(dx, caches['do_' + idx])
                dx = relu_backwardd(dx, caches['rl_' + idx])
                if self.normalization == 'batchnorm':
                    dx, grads['gamma' + idx], grads['beta' + idx] = batchnorm_backward(dx, caches['bn_' + idx])
                if self.normalization == 'layernorm':
                    dx, grads['gamma' + idx], grads['beta' + idx] = layernorm_backward(dx, caches['ln_' + idx])
            dx, grads['W' + idx], grads['b' + idx] = affine_backward(dx, caches['af_' + idx])
            # Gradient of the 0.5 * reg * ||W||^2 term.
            grads['W' + idx] += self.reg * self.params['W' + idx]

        return loss, grads

In [24]:
from importnb import imports
import importlib
# Import the `solver` module by name while the importnb context is active.
# NOTE(review): presumably `imports("ipynb")` lets `solver.ipynb` be imported
# like a regular module - confirm against the importnb documentation.
with imports("ipynb"):
    solver = importlib.import_module("solver")

In [25]:
from solver import Solver

In [26]:
def create_solver_instance(data_dict, dtype, device):
    """Build a Solver around a TwoLayerNet with a 200-unit hidden layer.

    Inputs:
    - data_dict: Data handed to the Solver unchanged
    - dtype: Passed to TwoLayerNet as its dtype
    - device: Passed to TwoLayerNet as its device

    Returns:
    - The Solver instance, constructed with no further options
    """
    net = TwoLayerNet(hidden_dim=200, dtype=dtype, device=device)
    return Solver(net, data_dict)


In [27]:
    def get_three_layer_network_params():
        """Return the (weight_scale, learning_rate) pair for the three-layer network."""
        # Experiment with these!
        params = {
            'weight_scale': 2e-2,
            'learning_rate': 1e-2,
        }
        return params['weight_scale'], params['learning_rate']

    def get_five_layer_network_params():
        """Return the (weight_scale, learning_rate) pair for the five-layer network."""
        # Experiment with these!
        params = {
            'weight_scale': 5e-2,
            'learning_rate': 2e-2,
        }
        return params['weight_scale'], params['learning_rate']
    

In [28]:
def sgd(w, dw, config=None):
    """
    Performs vanilla stochastic gradient descent.

    The update is applied to w in place and the same tensor is returned.

    config format:
    - learning_rate: Scalar learning rate.
    """
    config = {} if config is None else config
    # setdefault fills in the default and hands back the effective value.
    step_size = config.setdefault('learning_rate', 1e-1)

    w -= step_size * dw
    return w, config

In [2]:
import torch
def sgd_momentum(w, dw, config=None):
    """
    Performs stochastic gradient descent with momentum.

    config format:
    - learning_rate: Scalar learning rate.
    - momentum: Scalar between 0 and 1 giving the momentum value.
      Setting momentum = 0 reduces to sgd.
    - velocity: A tensor of the same shape as w and dw used to store a
      moving average of the gradients.

    Returns the updated weights (a new tensor) and the config, whose
    'velocity' entry holds the new velocity.
    """
    config = {} if config is None else config
    step_size = config.setdefault('learning_rate', 1e-2)
    mu = config.setdefault('momentum', 0.9)
    previous_velocity = config.get('velocity', torch.zeros_like(w))

    # Decay the old velocity and take a step against the gradient.
    velocity = mu * previous_velocity - step_size * dw
    updated_w = w + velocity

    config['velocity'] = velocity
    return updated_w, config


In [3]:
def rmsprop(w, dw, config=None):
    """
    Uses the RMSProp update rule, which uses a moving average of squared
    gradient values to set adaptive per-parameter learning rates.

    config format:
    - learning_rate: Scalar learning rate.
    - decay_rate: Scalar between 0 and 1 giving the decay rate for the squared
      gradient cache.
    - epsilon: Small scalar used for smoothing to avoid dividing by zero.
    - cache: Moving average of second moments of gradients.

    Returns the updated weights (a new tensor) and the config, whose 'cache'
    entry holds the new moving average.
    """
    config = {} if config is None else config
    step_size = config.setdefault('learning_rate', 1e-2)
    decay = config.setdefault('decay_rate', 0.99)
    smoothing = config.setdefault('epsilon', 1e-8)
    previous_cache = config.setdefault('cache', torch.zeros_like(w))

    # Moving average of the squared gradient.
    squared_avg = decay * previous_cache + (1 - decay) * dw ** 2
    config['cache'] = squared_avg

    # Per-parameter step, scaled by the root of the moving average.
    updated_w = w - step_size * dw / (torch.sqrt(squared_avg) + smoothing)
    return updated_w, config


In [4]:
import torch
def adam(w, dw, config=None):
    """
    Uses the Adam update rule, which incorporates moving averages of both the
    gradient and its square and a bias correction term.

    config format:
    - learning_rate: Scalar learning rate.
    - beta1: Decay rate for moving average of first moment of gradient.
    - beta2: Decay rate for moving average of second moment of gradient.
    - epsilon: Small scalar used for smoothing to avoid dividing by zero.
    - m: Moving average of gradient.
    - v: Moving average of squared gradient.
    - t: Iteration number.

    Returns the updated weights (a new tensor) and the config, in which 'm',
    'v' and 't' have been advanced by one step.
    """
    if config is None: config = {}
    config.setdefault('learning_rate', 1e-3)
    config.setdefault('beta1', 0.9)
    config.setdefault('beta2', 0.999)
    config.setdefault('epsilon', 1e-8)
    config.setdefault('m', torch.zeros_like(w))
    config.setdefault('v', torch.zeros_like(w))
    config.setdefault('t', 0)

    beta1, beta2 = config['beta1'], config['beta2']

    # Advance the iteration counter first so the first step uses t = 1.
    config['t'] += 1
    t = config['t']

    config['m'] = beta1 * config['m'] + (1 - beta1) * dw
    config['v'] = beta2 * config['v'] + (1 - beta2) * (dw ** 2)

    # Bias correction: m and v start at zero and are therefore biased
    # towards zero during the first iterations.
    m_hat = config['m'] / (1 - beta1 ** t)
    v_hat = config['v'] / (1 - beta2 ** t)

    next_w = w - config['learning_rate'] * m_hat / (torch.sqrt(v_hat) + config['epsilon'])

    return next_w, config

In [6]:
import torch
class Dropout(object):
    """(Inverted) dropout layer with hand-written forward/backward passes.

    Both methods are static, so they can be called either on the class
    (``Dropout.forward(x, dropout_param)``) or on an instance.
    """

    @staticmethod
    def forward(x, dropout_param):
        """Forward pass for dropout.

        Inputs:
        - x: Input data, of any shape
        - dropout_param: Dictionary with the following keys:
          - p: Probability of DROPPING each unit; units are kept with
            probability 1 - p and the kept ones are scaled by 1 / (1 - p)
          - mode: 'train' or 'test'; in 'test' mode x is returned unchanged
          - seed: Optional; when present the global torch RNG is re-seeded
            with it on every call, which makes the mask deterministic

        Returns a tuple of:
        - out: Tensor of the same shape and dtype as x
        - cache: (dropout_param, mask); mask is None in 'test' mode

        Raises:
        - ValueError: if mode is neither 'train' nor 'test'
        """
        p, mode = dropout_param["p"], dropout_param["mode"]
        if "seed" in dropout_param:
            torch.manual_seed(dropout_param["seed"])

        mask = None
        if mode == "train":
            keep_prob = 1 - p
            # Sample on the CPU (so a given seed yields the same mask
            # wherever x lives), then move the mask next to x.
            mask = torch.bernoulli(torch.ones(x.shape) * keep_prob) / keep_prob
            mask = mask.to(x.device)
            out = mask * x
        elif mode == "test":
            out = x
        else:
            raise ValueError('Invalid forward dropout mode "%s"' % mode)

        cache = (dropout_param, mask)
        # mask * x may have been promoted; hand back the caller's dtype.
        out = out.to(x.dtype)
        return out, cache

    @staticmethod
    def backward(dout, cache):
        """Backward pass for dropout.

        Inputs:
        - dout: Upstream derivatives, of any shape
        - cache: (dropout_param, mask) from the forward pass

        Returns:
        - dx: dout * mask in 'train' mode, dout in 'test' mode, and None for
          any other mode
        """
        dropout_param, mask = cache
        mode = dropout_param["mode"]
        dx = None
        if mode == "train":
            dx = dout * mask
        elif mode == "test":
            dx = dout
        return dx