# Tarea 3: Regularización y Optimización <br/> CC6204 Deep Learning, Universidad de Chile  <br/> Hoja de respuestas
## Nombre: Diego Irarrazaval

**Fecha de entrega: 13 de noviembre de 2020**


In [1]:
# Este notebook está pensado para correr en CoLaboratory. 
# Lo único imprescindible por importar es torch
import torch

# Posiblemenete quieras instalar e importar ipdb para debuggear.
# Si es así, descomenta lo siguiente:
# !pip install -q ipdb
# import ipdb

# Aqui instalamos la libreria de correccion del curso
!pip install -U "git+https://github.com/dccuchile/CC6204.git@master#egg=cc6204&subdirectory=autocorrect"
from timeit import default_timer as timer

Collecting cc6204
  Cloning https://github.com/dccuchile/CC6204.git (to revision master) to /tmp/pip-install-2toovxlp/cc6204
  Running command git clone -q https://github.com/dccuchile/CC6204.git /tmp/pip-install-2toovxlp/cc6204
Building wheels for collected packages: cc6204
  Building wheel for cc6204 (setup.py) ... [?25l[?25hdone
  Created wheel for cc6204: filename=cc6204-0.5.0-cp36-none-any.whl size=5801 sha256=8c934e15707fc942025fd8a8c44dfbe4ff51dbc3d9396f868c28cbf440f5f626
  Stored in directory: /tmp/pip-ephem-wheel-cache-i5z63xeg/wheels/62/f0/30/aadcb7ce24a2f9c935890518e902d4e23bf97b80f47bb64414
Successfully built cc6204
Installing collected packages: cc6204
Successfully installed cc6204-0.5.0


In [2]:
# importamos las herramientas del curso
from cc6204 import AutoCorrect, FailedTest

# ingresa el host y port que posteamos en u-cursos
corrector = AutoCorrect(host="cc6204.dcc.uchile.cl", port=443)

# anota el token que te daremos en u-cursos
token = "]ye/Ox;nsz"

Connection stablished


# Parte 1: Regularización y Generalización



## 1a) Regularización por *weight decay*

In [3]:
# Tu código debiera continuar así

class SGD():
  def __init__(self, parameters, lr, beta=0):
    # lo que sea necesario inicializar
    self.params = [p for p in parameters if p is not None]
    self.lr = lr
    self.beta = beta

  def step(self):
    # actualiza acá los parámetros a partir de los gradientes
    # y considera el nuevo valor beta
    for p in self.params:
      p.data = (1 - self.beta)*p.data - self.lr*p.grad

In [53]:
# Tests del API del curso
weight, grad = corrector.get_test_data(homework=3, question="1a", test=1, token=token)

weight = torch.tensor(weight, requires_grad=True)
weight.grad = torch.tensor(grad)

optimizer = SGD([weight], lr=0.1, beta=0.1)
optimizer.step()

# Submit
corrector.submit(homework=3, question="1a", test=1, token=token, answer=weight)

Using cached test data


TypeError: ignored

## 1b) Regularización por dropout

Para esta parte de la tarea, va a ser necesario modificar el método `forward` para que entregue el valor a la salida de la i-esima capa escondida. Para esto se modifica el método forward para que reciba un parámetro `output_layer` que indica luego de que capa escondida se espera el output.

In [5]:
# Los gradientes se pueden implementar como parte de cada una de las funciones de activación
# En el caso de swish y celu devuelven una tupla (d_dx, d_dp)

def sig(T, gradient=False):
  if gradient:
    sigT = sig(T)
    return sigT * (1 - sigT)
  return torch.reciprocal(1 + torch.exp(-1 * T))
  
def tanh(T, gradient=False):
  if gradient:
    tanhT = tanh(T)
    return 1 - tanhT * tanhT
  E = torch.exp(T)
  e = torch.exp(-1 * T)
  return (E - e) * torch.reciprocal(E + e)

def relu(T, gradient=False):
  if gradient:
    outT = torch.zeros_like(T)
    outT[T>=0] = 1
    return outT
  return torch.max(T, torch.zeros_like(T))

def swish(T, beta=1, gradient=False):
  if gradient:
    sigbT = sig(beta * T)
    swishT = T * sigbT
    return sigbT + beta * swishT * (1 - sigbT), swishT * (T - swishT)
  return T * torch.reciprocal(1 + torch.exp(-beta * T))

def celu(T, alpha=1, gradient=False):
  if alpha == 0:
    raise ValueError("alpha cannot be 0")

  zeros = torch.zeros_like(T)
  Talpha = T / alpha
  
  if gradient:
    e = Talpha.exp()
    d_dx = torch.ones_like(T)
    d_dx[T<0] = e[T<0]
    zeros[T<0] = (celu(T)[T<0] - T[T<0] * e[T<0]) / alpha
    return d_dx, zeros # d_dx, d_da
  
  return torch.max(zeros, T) + torch.min(zeros, alpha * (Talpha).expm1())

def softmax(T, dim, estable=True):
  if estable:
    T -= T.max(dim=dim, keepdim=True)[0]  # keepdim=True => output has dim with size 1. Otherwise, dim is squeezed
  exp = torch.exp(T)
  return exp / torch.sum(exp, dim=dim, keepdim=True)  # keepdim=True => output has dim with size 1. Otherwise, dim is squeezed

In [6]:
# Tu código debiera continuar como sigue
import torch.nn as nn
from torch.nn import Parameter, ParameterList
import math

def get_init_weights(shape):
  W = torch.randn(shape)
  # stdv = 1. / math.sqrt(W.size(1))
  # W.data.uniform_(-stdv, stdv)
  return Parameter(W)


class FFNN(nn.Module):
  def __init__(self, F, l_h, l_a, C, keep_prob=None, l_a_params=None):
    super(FFNN, self).__init__()    
  
    sizes = [F] + l_h + [C]
    self.Ws = ParameterList([get_init_weights((sizes[i], sizes[i+1])) for i in range(len(sizes)-1)])
    self.bs = ParameterList([Parameter(torch.zeros(h)) for h in sizes[1:]])
    self.fs = l_a
    if l_a_params is not None:
      self.fs_ps_mask = [Parameter(torch.tensor(p)) if p else None for p in l_a_params]
    else:
      self.fs_ps_mask = [None for _ in l_a]
    self.fs_ps = ParameterList([p for p in self.fs_ps_mask if p])
    self.keep_prob = keep_prob    
  
    self.drop_masks = [torch.zeros_like(w) for w in self.Ws]

  @property
  def in_size(self):
    return self.Ws[0].shape[0]
    
  def load_weights(self, Ws, U, bs, c):
    self.Ws = ParameterList([Parameter(W) for W in Ws + [U]])
    self.bs = ParameterList([Parameter(b) for b in bs + [c]])
  
  def resumen(self):
    # Usa self.parameters() o self.named_parameters().
    for name, p in self.named_parameters():
      print('{}:\t{}'.format(name, p.size()))
    pass

  def forward(self, x, predict=False):
    if predict:
      self.cacheU = [] #, self.cacheH = [], []
      for W, b, f, p in zip(self.Ws[:-1], self.bs[:-1], self.fs, self.fs_ps_mask):
        x = torch.mm(x, W) + b
        self.cacheU.append(x)
        x = f(x, p.item()) if p else f(x)
  #       self.cacheH.append(x)
      return softmax(torch.mm(x, self.Ws[-1]) + self.bs[-1], dim=1)
    
    else:
      self.cacheU = [] #, self.cacheH = [], []
      i = 0
      for W, b, f, p, keep_prob in zip(self.Ws[:-1], self.bs[:-1], self.fs, self.fs_ps_mask, self.keep_prob):
        x = torch.mm(x, W) + b
        self.cacheU.append(x)
        x = f(x, p.item()) if p else f(x)
        self.drop_masks[i] = (torch.rand_like(x) < 0).to(x.device)
        if keep_prob > 0:
          scale = 1/keep_prob
        else:
          scale = 0
        x = self.drop_masks[i] * x * scale
        i+=1
  #       self.cacheH.append(x)
      return softmax(torch.mm(x, self.Ws[-1]) + self.bs[-1], dim=1)
  
  # nuevo código Tarea 2
  def backward(self, x, y, y_pred, predict=False):
    current_grad =  (y_pred - y) / y.size(0)

    for i in range(len(self.Ws)-1, 0, -1):
      if self.fs_ps_mask[i-1] is None:
        self.Ws[i].grad = self.fs[i-1](self.cacheU[i-1]).t() @ current_grad
        self.Ws[i].grad *= self.drop_masks[i].to(current_grad.device)
      else:
        self.Ws[i].grad = self.fs[i-1](self.cacheU[i-1], self.fs_ps_mask[i-1].item()).t()  @ current_grad
        self.Ws[i].grad *= self.drop_masks[i].to(current_grad.device)
      self.bs[i].grad = current_grad.sum(dim=0)
      h_grad = current_grad @ self.Ws[i].t()
      
      if self.fs_ps_mask[i-1] is None:
        current_grad = self.fs[i-1](self.cacheU[i-1], gradient=True) * h_grad
      else:
        current_grad, p_grad = self.fs[i-1](self.cacheU[i-1], self.fs_ps_mask[i-1], gradient=True)
        current_grad *= h_grad
        self.fs_ps_mask[i-1].grad = (p_grad * h_grad).sum()
    
    self.Ws[0].grad = (x.t() @ current_grad) * self.drop_masks[0].to(x.device)
    self.bs[0].grad = current_grad.sum(dim=0)

In [93]:
# Otro intento de dropout
class MyDropout(nn.Module):
  def __init__(self, layer_size, keep_prob):
    super(MyDropout, self).__init__()
    self.keep_prob = keep_prob
    self.mask = torch.zeros(size = layer_size)

  def forward(self, x, predict=False):
    if ~predict:
      self.mask = (torch.randn_like(x) < 0)
      if self.keep_prob > 0:
        scale = 1/self.keep_prob
      else:
        scale = 0
      return self.mask * x * scale
    else:
      return x

class FFNN2(nn.Module):
  def __init__(self, F, l_h, l_a, C, keep_prob=None, l_a_params=None):
    super(FFNN2, self).__init__()    
  
    sizes = [F] + l_h + [C]
    self.Ws = ParameterList([get_init_weights((sizes[i], sizes[i+1])) for i in range(len(sizes)-1)])
    self.bs = ParameterList([Parameter(torch.zeros(h)) for h in sizes[1:]])
    self.fs = l_a
    if l_a_params is not None:
      self.fs_ps_mask = [Parameter(torch.tensor(p)) if p else None for p in l_a_params]
    else:
      self.fs_ps_mask = [None for _ in l_a]
    self.fs_ps = ParameterList([p for p in self.fs_ps_mask if p])
    self.keep_prob = keep_prob    
  
    self.drop_masks = [MyDropout(w.shape, p) for w, p in zip(self.Ws, self.keep_prob)]

  @property
  def in_size(self):
    return self.Ws[0].shape[0]
    
  def load_weights(self, Ws, U, bs, c):
    self.Ws = ParameterList([Parameter(W) for W in Ws + [U]])
    self.bs = ParameterList([Parameter(b) for b in bs + [c]])
  
  def resumen(self):
    # Usa self.parameters() o self.named_parameters().
    for name, p in self.named_parameters():
      print('{}:\t{}'.format(name, p.size()))
    pass

  def forward(self, x, predict=False):
    if predict:
      self.cacheU = [] #, self.cacheH = [], []
      for W, b, f, p in zip(self.Ws[:-1], self.bs[:-1], self.fs, self.fs_ps_mask):
        x = torch.mm(x, W) + b
        self.cacheU.append(x)
        x = f(x, p.item()) if p else f(x)
  #       self.cacheH.append(x)
      return softmax(torch.mm(x, self.Ws[-1]) + self.bs[-1], dim=1)
    
    else:
      self.cacheU = [] #, self.cacheH = [], []
      i = 0
      for W, b, f, p, dropMask in zip(self.Ws[:-1], self.bs[:-1], self.fs, self.fs_ps_mask, self.drop_masks):
        x = dropMask.forward(torch.mm(x, W) + b)
        self.cacheU.append(x)
        x = f(x, p.item()) if p else f(x)
      return softmax(torch.mm(x, self.Ws[-1]) + self.bs[-1], dim=1)
  
  # nuevo código Tarea 2
  def backward(self, x, y, y_pred, predict=False):
    current_grad =  (y_pred - y) / y.size(0)

    for i in range(len(self.Ws)-1, 0, -1):
      if self.fs_ps_mask[i-1] is None:
        self.Ws[i].grad = self.drop_masks[i].forward( self.fs[i-1](self.cacheU[i-1]).t() ) @ current_grad
        #self.Ws[i].grad *= self.drop_masks[i].mask.to(current_grad.device)
      else:
        self.Ws[i].grad = self.drop_masks[i].forward( self.fs[i-1](self.cacheU[i-1], self.fs_ps_mask[i-1].item()).t() )  @ current_grad
        # self.Ws[i].grad *= self.drop_masks[i].mask.to(current_grad.device)
      self.bs[i].grad = current_grad.sum(dim=0)
      h_grad = current_grad @ self.Ws[i].t()
      
      if self.fs_ps_mask[i-1] is None:
        current_grad = self.fs[i-1](self.cacheU[i-1], gradient=True) * h_grad
      else:
        current_grad, p_grad = self.fs[i-1](self.cacheU[i-1], self.fs_ps_mask[i-1], gradient=True)
        current_grad *= h_grad
        self.fs_ps_mask[i-1].grad = (p_grad * h_grad).sum()
    
    # self.Ws[0].grad = (x.t() @ current_grad) * self.drop_masks[0].to(x.device)
    self.Ws[0].grad = self.drop_masks[i].forward( (x.t() @ current_grad) )
    self.bs[0].grad = current_grad.sum(dim=0)

In [105]:
# tercer intento de dropout:
# Siguiendo el ejeplo de https://stackoverflow.com/questions/54109617/implementing-dropout-from-scratch
class MyDropout2(nn.Module): 
  def __init__(self, p: float = 0.5):
    super(MyDropout2, self).__init__()
    if p < 0 or p > 1:
      raise ValueError("dropout probability has to be between 0 and 1, " "but got {}".format(p))
    self.p = p

  def forward(self, X, predict=False):
    if ~predict:
      eps = 1e-8
      binomial = torch.distributions.binomial.Binomial(probs=1-self.p)
      return X * binomial.sample(X.size()).to(X.device) * (1.0/((1-self.p)+eps))
    return weights


class FFNN3(nn.Module):
  def __init__(self, F, l_h, l_a, C, keep_prob=None, l_a_params=None):
    super(FFNN3, self).__init__()    
  
    sizes = [F] + l_h + [C]
    self.Ws = ParameterList([get_init_weights((sizes[i], sizes[i+1])) for i in range(len(sizes)-1)])
    self.bs = ParameterList([Parameter(torch.zeros(h)) for h in sizes[1:]])
    self.fs = l_a
    if l_a_params is not None:
      self.fs_ps_mask = [Parameter(torch.tensor(p)) if p else None for p in l_a_params]
    else:
      self.fs_ps_mask = [None for _ in l_a]
    self.fs_ps = ParameterList([p for p in self.fs_ps_mask if p])
    self.keep_prob = keep_prob    
  
    self.drop_masks = [MyDropout2(p) for p in self.keep_prob]

  @property
  def in_size(self):
    return self.Ws[0].shape[0]
    
  def load_weights(self, Ws, U, bs, c):
    self.Ws = ParameterList([Parameter(W) for W in Ws + [U]])
    self.bs = ParameterList([Parameter(b) for b in bs + [c]])
  
  def resumen(self):
    # Usa self.parameters() o self.named_parameters().
    for name, p in self.named_parameters():
      print('{}:\t{}'.format(name, p.size()))
    pass

  def forward(self, x, predict=False):
    if predict:
      self.cacheU = [] #, self.cacheH = [], []
      for W, b, f, p in zip(self.Ws[:-1], self.bs[:-1], self.fs, self.fs_ps_mask):
        x = torch.mm(x, W) + b
        self.cacheU.append(x)
        x = f(x, p.item()) if p else f(x)
  #       self.cacheH.append(x)
      return softmax(torch.mm(x, self.Ws[-1]) + self.bs[-1], dim=1)
    
    else:
      self.cacheU = [] #, self.cacheH = [], []
      i = 0
      for W, b, f, p, dropMask in zip(self.Ws[:-1], self.bs[:-1], self.fs, self.fs_ps_mask, self.drop_masks):
        x = dropMask.forward(torch.mm(x, W) + b)
        self.cacheU.append(x)
        x = f(x, p.item()) if p else f(x)
      return softmax(torch.mm(x, self.Ws[-1]) + self.bs[-1], dim=1)
  
  # nuevo código Tarea 2
  def backward(self, x, y, y_pred, predict=False):
    current_grad =  (y_pred - y) / y.size(0)

    for i in range(len(self.Ws)-1, 0, -1):
      if self.fs_ps_mask[i-1] is None:
        self.Ws[i].grad = self.drop_masks[i].forward( self.fs[i-1](self.cacheU[i-1]).t() ) @ current_grad
        #self.Ws[i].grad *= self.drop_masks[i].mask.to(current_grad.device)
      else:
        self.Ws[i].grad = self.drop_masks[i].forward( self.fs[i-1](self.cacheU[i-1], self.fs_ps_mask[i-1].item()).t() )  @ current_grad
        # self.Ws[i].grad *= self.drop_masks[i].mask.to(current_grad.device)
      self.bs[i].grad = current_grad.sum(dim=0)
      h_grad = current_grad @ self.Ws[i].t()
      
      if self.fs_ps_mask[i-1] is None:
        current_grad = self.fs[i-1](self.cacheU[i-1], gradient=True) * h_grad
      else:
        current_grad, p_grad = self.fs[i-1](self.cacheU[i-1], self.fs_ps_mask[i-1], gradient=True)
        current_grad *= h_grad
        self.fs_ps_mask[i-1].grad = (p_grad * h_grad).sum()
    
    # self.Ws[0].grad = (x.t() @ current_grad) * self.drop_masks[0].to(x.device)
    self.Ws[0].grad = self.drop_masks[i].forward( (x.t() @ current_grad) )
    self.bs[0].grad = current_grad.sum(dim=0)

In [103]:
X = torch.rand(1, 10).to('cuda')
p = 0.5


print( X * (torch.randn_like(X)>p))

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]], device='cuda:0')


In [106]:
# Tests del API del curso
torch.manual_seed(0)
sample = torch.rand(1, 10)
red = FFNN3(10, [1000], [sig], 1, keep_prob=[1.0, 0.5])
y = red.forward(sample, predict=False)
output_mask = (y == 0)
percent = torch.sum(output_mask).item() / list(output_mask.size())[-1]

# Submit
corrector.submit(homework=3, question="1b", test=1, token=token, answer=percent)



Correct Test!


## 1c) Entrenamiento y generalización sobre MNIST 

In [72]:
# Tu código acá
def CELoss(Q, P, estable=True, epsilon=1e-8):
  N = Q.shape[0]
  if estable:
    Q = Q.clamp(epsilon, 1-epsilon)
  return -(P * Q.log()).sum()/N

In [88]:
import sys

# Tu código acá
def entrenar_FFNN(red, dataset, optimizador, epochs=1, batch_size=1, reports_every=1, device='cuda'):
  device = device
  red.to(device)
  data = DataLoader(dataset, batch_size, shuffle=True)
  total=len(dataset)
  tiempo_epochs = 0
  loss, acc = [], []
  for e in range(1, epochs+1):  
    inicio_epoch = timer()
    
    for x, y in data:
      x, y = x.view(x.size(0), -1).float().to(device), y.to(device)
      
      y_pred = red.forward(x,predict=False)
      
      y_onehot = torch.zeros_like(y_pred)
      y_onehot[torch.arange(x.size(0)), y] = 1.
    
      red.backward(x, y_onehot, y_pred)

      optimizador.step()
      
    tiempo_epochs += timer() - inicio_epoch
    
    if e % reports_every == 0:
      X = dataset.data.view(len(dataset), -1).float().to(device)
      Y = dataset.targets.to(device)
      
      Y_PRED = red.forward(X, predict = True).to(device)
      
      Y_onehot = torch.zeros_like(Y_PRED)
      Y_onehot[torch.arange(X.size(0)), Y] = 1.

      L_total = CELoss(Y_PRED, Y_onehot)
      loss.append(L_total)
      diff = Y-torch.argmax(Y_PRED,1)
      errores = torch.nonzero(diff).size(0)
      
      Acc=100*(total-errores)/total
      acc.append(Acc)

      sys.stdout.write(
            '\rEpoch:{0:03d}'.format(e) + ' Acc:{0:.2f}%'.format(Acc)
            + ' Loss:{0:.4f}'.format(L_total) 
            + ' Tiempo/epoch:{0:.3f}s'.format(tiempo_epochs/e))
  
  return loss, acc

In [89]:
import matplotlib.pyplot as plt

def plot_results(loss, acc):
  f1 = plt.figure(1)
  ax1 = f1.add_subplot(111)
  ax1.set_title("Loss")    
  ax1.set_xlabel('epochs')
  ax1.set_ylabel('loss')
  ax1.plot(loss, c='r')
  f1.show()

  f2 = plt.figure(2)
  ax2 = f2.add_subplot(111)
  ax2.set_title("Accuracy")    
  ax2.set_xlabel('epochs')
  ax2.set_ylabel('acc')
  ax2.plot(acc, c='b')
  f2.show()

In [90]:
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor
from torch.utils.data import Dataset, DataLoader

# Importamos funcionalidades útiles para mirar los datos.
from matplotlib.pyplot import subplots
from random import randint

# Descarga y almacena el conjunto de entrenamiento de MNIST.
mnist_dataset = MNIST('mnist', train=True, transform=ToTensor(), download=True)
print('Cantidad total de datos:',len(mnist_dataset))

Cantidad total de datos: 60000


In [108]:
import os
from numpy import loadtxt

# if not os.path.exists('/content/CC6204'):
#   !git clone https://github.com/dccuchile/CC6204.git

# dir_path = '/content/CC6204/2020/tareas/tarea1/mnist_weights'
# Ws = [torch.from_numpy(loadtxt(os.path.join(dir_path, '{}.txt'.format(name)))).float() for name in ['W1', 'W2']]
# bs = [torch.from_numpy(loadtxt(os.path.join(dir_path, '{}.txt'.format(name)))).float() for name in ['b1', 'b2']]
# U = torch.from_numpy(loadtxt(os.path.join(dir_path, 'U.txt'))).float()
# c = torch.from_numpy(loadtxt(os.path.join(dir_path, 'c.txt'))).float()

# mnist_model = FFNN(784, [32, 16], [relu, relu], 10)
# mnist_model.load_weights(Ws, U, bs, c)

mnist_model = FFNN2(784, [512, 1024,  128], [relu, relu,  relu], 10, keep_prob=[1, .5, .5, .25])

mnist_optimizer = SGD(mnist_model.parameters(), lr=1e-5)
with torch.no_grad():
  mnist_loss, mnist_acc = entrenar_FFNN(mnist_model, mnist_dataset, mnist_optimizer, epochs=30, batch_size=32)



Epoch:030 Acc:9.43% Loss:16.6830 Tiempo/epoch:14.838s

# Parte 2: Optimización

## 2a) Inicialización de Xavier/He

Para los test de esta parte vamos a necesitar que modifiques tu código para que se pueda entregar valores predeterminados de `r`. Ahora tu código para las inicializaciones debe ser: `xavier_init(first_dim, second_dim, r=None)`, `he_init(first_dim, second_dim, r=None)`.

In [92]:
# Tu código debiera continuar como sigue
def xavier_init(first_dim, second_dim, r=None):
  from math import sqrt
  factor = sqrt(1/(first_dim))

  if r==None:
    return factor * torch.normal(mean=0, std=1, size=(first_dim,second_dim))
  else:
    return torch.tensor(factor) * r

def he_init(first_dim, second_dim, r=None):
  from math import sqrt
  factor = sqrt(2/(first_dim))

  if r==None:
    return factor * torch.normal(mean=0, std=1, size=(first_dim,second_dim))
  else:
    return torch.tensor(factor) * r


In [26]:
# Tests del API del curso
r_xavier = corrector.get_test_data(homework=3, question="2a", test=1, token=token)
r_he = corrector.get_test_data(homework=3, question="2a", test=2, token=token)

w_xavier = xavier_init(50, 50, torch.tensor(r_xavier))
w_he = he_init(50, 50, torch.tensor(r_he))

corrector.submit(homework=3, question="2a", test=1, token=token, answer=w_xavier)
corrector.submit(homework=3, question="2a", test=2, token=token, answer=w_he)

Using cached test data
Using cached test data
Correct Test!
Correct Test!


## 2b) Descenso de gradiente con momentum

In [54]:
# Tu código debiera continuar así

class SGD():
  def __init__(self, parameters, lr, momentum=0):
    # lo que sea necesario inicializar
    self.params = [p for p in parameters if p is not None]
    self.speed = [torch.zeros_like(p) for p in parameters if p is not None]
    self.lr = lr
    self.momentum = momentum
    
  
  def step(self):
    # actualiza acá los parámetros a partir de los gradientes
    # y considerando el valor de momentum que acabámos de agregar
    for p, s in zip(self.params, self.speed):
      s.data = self.momentum * s.data - self.lr * p.grad
      p.data += s.data

In [55]:
# Tests del API del curso
weight, grad = corrector.get_test_data(homework=3, question="2b", test=1, token=token)

weight = torch.tensor(weight, requires_grad=True)
weight.grad = torch.tensor(grad)

optimizer = SGD([weight], lr=0.1, momentum=0.9)
optimizer.step()

# Submit
corrector.submit(homework=3, question="2b", test=1, token=token, answer=weight)
optimizer.step()
corrector.submit(homework=3, question="2b", test=2, token=token, answer=weight)

Using cached test data
Correct Test!
Correct Test!


## 2c) RMSProp

In [50]:
# Tu código acá

class RMSProp():
  def __init__(self, red, lr=0.001, beta=0.9, epsilon=1e-8):
    # en este caso debes inicializar la variable que acumula
    # el promedio exponencial de los cuadrados
    # lo que sea necesario inicializar
    self.params = [p for p in red if p is not None]
    self.speed = [torch.zeros_like(p) for p in red if p is not None]
    self.lr = lr
    self.beta = beta
    self.epsilon = epsilon
  
  def step(self):
    # actualiza acá los parámetros a partir de los gradientes
    # y la corrección según S
    for p, s in zip(self.params, self.speed):
      s.data = self.beta * s.data + (1-self.beta)*(p.grad * p.grad)
      p.data -= self.lr*(1/(torch.sqrt(s.data)+ self.epsilon))*p.grad

In [51]:
# Tests del API del curso
weight, grad = corrector.get_test_data(homework=3, question="2c", test=1, token=token)

weight = torch.tensor(weight, requires_grad=True)
weight.grad = torch.tensor(grad)

optimizer = RMSProp([weight], lr=0.001, beta=0.9, epsilon=1e-8)
optimizer.step()

# Submit
corrector.submit(homework=3, question="2c", test=1, token=token, answer=weight)
optimizer.step()
corrector.submit(homework=3, question="2c", test=2, token=token, answer=weight)

Using cached test data
Correct Test!
Correct Test!


## 2d) Adam

In [117]:
# Tu código acá

class Adam():
  def __init__(self, red, lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
    # en este caso debes inicializar la variable que acumula
    # el promedio exponencial de los cuadrados
    self.params = [p for p in red if p is not None]
    self.speed = [torch.zeros_like(p) for p in red if p is not None]
    self.moments = [torch.zeros_like(p) for p in red if p is not None]

    self.lr = lr
    self.beta1 = beta1
    self.beta2 = beta2
    self.epsilon = epsilon

    self.n = 1
  
  def step(self):
    # actualiza acá los parámetros a partir de los gradientes
    # y la corrección según S
    for p, s, m in zip(self.params, self.speed, self.moments):
      m.data = self.beta1*m.data + (1-self.beta1)*p.grad
      s.data = self.beta2 * s.data + (1-self.beta2)*(p.grad * p.grad)

      m_mean = m.data / (1-self.beta1**self.n)
      s_mean = s.data / (1-self.beta2**self.n)
      p.data -= self.lr*(1/(torch.sqrt(s_mean)+ self.epsilon))*m_mean
    self.n += 1

In [118]:
# Tests del API del curso
weight, grad = corrector.get_test_data(homework=3, question="2d", test=1, token=token)

weight = torch.tensor(weight, requires_grad=True)
weight.grad = torch.tensor(grad)

optimizer = Adam([weight], lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8)
optimizer.step()

# Submit
corrector.submit(homework=3, question="2d", test=1, token=token, answer=weight)
optimizer.step()
corrector.submit_check_some(homework=3, question="2d", tests=[2, 3], token=token,
                            answer_dict={2: weight, 3: weight}, required_number=1)

Using cached test data
Correct Test!
Correct Test!


## 2e) Entrenamiento en MNIST 

Usa tu red neuronal para entrenar con los datos de MNIST y compara cómo cambian las curvas de entrenamiento dependiendo de factores como la inicialización y los algoritmos que utilices. Presenta al menos dos gráficos en donde compares. Por ejemplo, puedes presentar uno que para la misma estrategia de inicialización, los tres algoritmos de optimización para varias épocas y cómo evoluciona la pérdida y el acierto. En cada caso comenta que conclusiones puedes sacar. Algunos ejemplos de preguntas que podrías tratar de responder son:
* ¿cómo afecta el algoritmo de optimización al tiempo de convergencia de la red para los datos de entrenamiento?
* ¿cómo afecta el algoritmo de optimización en el acierto alcanzado por la red en los datos de prueba?
* Si haces la parte opcional de Batch Normalization, puedes también preguntarte cosas como si aplicar, o no, BN afecta a todos los algoritmos de optimización por igual.

In [120]:
# Aqui el codigo para entrenar en MNIST
mnist_model = FFNN2(784, [512, 1024,  128], [relu, relu,  relu], 10, keep_prob=[1, .5, .5, .25])

mnist_optimizer = Adam(mnist_model.parameters(), lr=1e-5)
with torch.no_grad():
  mnist_loss, mnist_acc = entrenar_FFNN(mnist_model, mnist_dataset, mnist_optimizer, epochs=30, batch_size=32)



Epoch:030 Acc:10.33% Loss:16.5175 Tiempo/epoch:14.815s

## Sobre los resultados obtenidos:
Notemos que los resultados obtenidos son pesimos. A pesar de tener tres implementaciones distintas de la capa de `DropOut` (que se cree es lo que esta mal implementado) no se logro superar el  $~15\%$ de _accuracy_ y las perdidas era muy altas. 

# Parte 3 (Opcional): Batch Normalization

In [None]:
# Tu código debiera continuar como sigue

class FFNN():
  def __init__(self, F, l_h, l_a, C, keep_prob=None, bn=None):
    # debes crear los parámetros necesarios para las capas de
    # batch normalizacion
    pass
  
  def forward(x, predict=False):
    # debes modificar esta función para considerar las capas para las que se
    # usará batch normalization
    # también debes preocuparte de guardar los datos estadísticos que se
    # usaran en tiempo de test (predict=True)
    pass
  
  def backward(x,y,y_pred):
    # computar acá todos los gradientes considerando las capas de 
    # batch normalization
    # no olvides considerar los nuevos parámetros entrenables.
    pass