In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt

In [56]:
class Gradient_descent:
  def __init__(self,param,lr=0.001):
    self.lr = lr
    self.param = list(param)

  def zero_grad(self):
    for par in self.param:
      if par.grad is not None:
        par.grad.zero_()

  def step(self):
    with torch.no_grad():
      for par in self.param:
        if par.grad is not None:
          par -= self.lr*par.grad



In [48]:
class MomentumGradientDescent:
  def __init__(self,param,lr=0.01,momentum=0.9):
    self.lr = lr
    self.param = list(param)
    self.velocities = [torch.zeros_like(p) for p in self.param]
    self.momentum = momentum
  def zero_grad(self):
    for par in self.param:
      if par.grad is not None:
        par.grad.zero_()
  def step(self):
    with torch.no_grad():
      for i,par in enumerate(self.param):
        if par.grad is not None:
          self.velocities[i] = self.momentum*self.velocities[i] + par.grad
          par -= self.lr * self.velocities[i]


Nesterov Accelerated Gradient->
   ~ updates the velocity based on where this (current) gradient is about to take us
   ~ steps :
          look_ahead(Θ`) = Θt - B*V(t-1)
          Gradient_at_Θ` (gt) = L(Θ`)
          velocity = B*V(t-1) + n*gt
          Θt+1 = Θt - velocity


In [49]:
class NAGD:
  def __init__(self,param,lr=0.001,momentum=0.9):
    self.lr = lr
    self.param = list(param)
    self.momentum = momentum
    self.velocities = [torch.zeros_like(p) for p in self.param]
  def zero_grad(self):
    for par in self.param:
      if par.grad is not None:
        par.grad.zero_()
  def step(self):
    with torch.no_grad():
      for i,par in enumerate(self.param):
        if par.grad is not None:
          v_prev =self.velocities[i]
          self.velocities[i] = self.momentum*self.velocities[i] + self.lr*par.grad

          par -= self.momentum*v_prev + self.velocities[i]

We dont take Gradient at Look ahead because its computationally expensive to compute the forward pass again with the look_ahead term ,

Adaptive Learning Rate Methods


1.   accumulates sum of square Gradients
2.   Uses this to control learning rate , e.g.;  if Gradients grow faster , the effective learning rate becomes smaller (learning_rate/sum_squared_gradeints) , and if gradient are too small , then effective lr is high.




In [50]:
class AdaGrad:
  def __init__(self,param,lr=0.01,eps=1e-6):
    self.lr = lr
    self.param = list(param)
    self.eps = eps
    self.G = [torch.zeros_like(p) for p in self.param]

  def zero_grad(self):
    for par in self.param:
      if par.grad is not None:
        par.grad.zero_()

  def step(self):
    with torch.no_grad():
      for i,par in enumerate(self.param):
        self.G[i] += par.grad**2
        adaptive_learning = self.lr/(torch.sqrt(self.G[i]+self.eps))
        par -= par.grad*adaptive_learning


RMSProp (Root Mean Squared Propogation)


1.   Hadles the aggressive decay of params like in adagrad by taking exponential moving average of squared gradeints
2.   EG = EG*b + (1-b)*(G**2)
     effective_lr = lr/sqrt(EG) + eps
     grad -= effective_lr*grad(i-1)



In [51]:
class RMSProp:
  def __init__(self,param,lr=0.001,beta=0.9,eps=1e-8):
    self.param = list(param)
    self.lr = lr
    self.b = beta
    self.eps = eps
    self.G = [torch.zeros_like(p) for p in self.param]

  def zero_grad(self):
    for par in self.param:
      if par.grad is not None:
        par.grad.zero_()

  def step(self):
    with torch.no_grad():
      for i,par in enumerate(self.param):
        if par.grad is not None:
          self.G[i] = self.G[i]*self.b + (1-self.b)*(par.grad**2)
          adaptive_learning = self.lr/(torch.sqrt(self.G[i])+self.eps)
          par -= adaptive_learning*par.grad


AdaDelta Combines AdaGRad and RMSProp , with generating an effective learning rate through the ratio of exponential moving average of squared params to  squared gradeints


1.   EG = rho*EG + (1-rho)*G^2
2.   rms_param = sqrt(EP+eps)
3.   rms_grad = sqrt(EG+eps)
4.   lr = rms_param/rms_grad
5.   delta = -lr*param.grad
6.   param += delta
EG - exponential moving average of gradeitns, EP - exponential moving average of params



In [52]:
class AdaDelta:
  def __init__(self,param,rho=0.9,eps=1e-8):
    self.rho = rho
    self.param = list(param)
    self.eps = eps
    self.param_av = [torch.zeros_like(p) for p in self.param]
    self.gradient_av = [torch.zeros_like(p) for p in self.param]

  def zero_grad(self):
    for par in self.param:
      if par.grad is not None:
        par.grad.zero_()

  def step(self):
    with torch.no_grad():
      for i,par in enumerate(self.param):
        if par.grad is not None:
          self.gradient_av[i] = self.gradient_av[i]*self.rho + (1-self.rho)*(par.grad**2)
          rms_par = torch.sqrt(self.param_av[i]+self.eps)
          rms_grad= torch.sqrt(self.gradient_av[i]+self.eps)
          effective_lr = rms_par/rms_grad
          delta = -effective_lr*par.grad
          self.param_av[i] = self.rho*self.param_av[i] + (1-self.rho)*(delta**2)
          par += delta


Adam
adam combines the adaptive learning rate technique from RMS prop and Fast cponvergences of Moment based gradeints


1.   m = b1*m + (1-b1)*grad
2.   v = b2*v + (1-b2)*(grad**2)
3.   m_hat = m/(1- b1^t)
4.   v_hat = v/(1- b2^t)
5.   lr = lr/sqrt(v_hat) + eps // effetive learning rate via RMS prop
6.   par -= lr*m_hat


In [53]:
class Adam:
  def __init__(self,param,lr=0.001,b1=0.9,b2=0.999,eps=1e-8):
    self.param = list(param)
    self.lr = lr
    self.b1 = b1
    self.b2 = b2
    self.eps = eps
    self.m = [torch.zeros_like(p) for p in self.param]
    self.v = [torch.zeros_like(p) for p in self.param]
    self.t = 0

  def zero_grad(self):
    for par in self.param:
      if par.grad is not None:
        par.grad.zero_()

  def step(self):
    self.t += 1
    with torch.no_grad():
      for i,par in enumerate(self.param):
        if par.grad is not None:
          self.m[i] = self.m[i]*self.b1 + (1-self.b1)*par.grad
          self.v[i] = self.v[i]*self.b2 + (1-self.b2)*(par.grad**2)

          m_c = self.m[i]/(1 - self.b1**self.t)
          v_c = self.v[i]/(1-self.b2**self.t)
          effective_lr = self.lr/(torch.sqrt(v_c)+self.eps)
          par -= effective_lr*m_c




In [54]:
class AdamW:
  def __init__(self,param,lr=0.001,b1=0.9,b2=0.999,eps=1e-8,weight_decay=0.01):
    self.param = param
    self.lr = lr
    self.b1 = b1
    self.b2 = b2
    self.eps = eps
    self.m = [torch.zeros_like(p) for p in self.param]
    self.v = [torch.zeros_like(p) for p in self.param]
    self.t = 0
    self.weight_decay = weight_decay

  def zero_grad(self):
    for par in self.param:
      if par.grad is not None:
        par.grad.zero_()

  def step(self):
    with torch.no_grad():
      for i,par in enumerate(self.param):
        self.m[i] = self.m[i]*self.b1 + (1-self.b1)*par.grad
        self.v[i] = self.v[i]*self.b2 + (1-self.b2)*(par.grad**2)

        m_c = self.m[i]/(1 - self.b1**self.t)
        v_c = self.v[i]/(1-self.b2**self.t)
        updative_update = m_c/(torch.sqrt(v_c)+self.eps)
        par -= self.lr*(updative_update + self.weight_decay*par)




In [11]:
from sklearn.datasets import make_circles

In [14]:
X,y = make_circles()

In [17]:
X.shape

(100, 2)

In [18]:
y.shape

(100,)

In [19]:
y

array([1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0])

In [20]:
X_train,X_test = X[:70,],X[70:,]
Y_train,Y_test = y[:70],y[70:]

In [21]:
X_train.shape

(70, 2)

In [24]:
X_train = torch.tensor(X_train,dtype=torch.float)
X_test = torch.tensor(X_test,dtype=torch.float)
Y_train = torch.tensor(Y_train,dtype=torch.long)
Y_test = torch.tensor(Y_test,dtype=torch.long)

In [76]:
class NN(nn.Module):
  def __init__(self):
    super().__init__()
    self.layers = nn.Sequential(
        nn.Linear(2,64),
        nn.ReLU(),
        nn.BatchNorm1d(64),
        nn.Linear(64,32),
        nn.ReLU(),
        nn.BatchNorm1d(32),
        nn.Linear(32,16),
        nn.ReLU(),
        nn.BatchNorm1d(16),
        nn.Linear(16,2)

    )
  def forward(self,x):
    return self.layers(x)



In [77]:
def train_loop(opt,X_train,Y_train,X_test,Y_test,model):
  loss_func = nn.CrossEntropyLoss()
  epoch = 0
  training_loss = []
  test_loss = []
  training_acc = []
  test_acc =[]
  while epoch <=100:
    model.train()

    outs  = model(X_train)
    los = loss_func(outs,Y_train)
    training_loss.append(los.item())
    opt.zero_grad()
    los.backward()
    opt.step()
    _,preds = torch.max(outs,1)
    acc = (preds == Y_train).sum().item()/X_train.shape[0]
    training_acc.append(acc)


    model.eval()
    with torch.no_grad():
      outs = model(X_test)
      _,preds = torch.max(outs,1)
      los = loss_func(outs,Y_test)
      test_loss.append(los.item())
      ac2 = (preds == Y_test).sum().item()/X_test.shape[0]
      test_acc.append(ac2)
      epoch += 1
  for i in range(1,6):

    print(f'training loss at epoch {i*20} is {training_loss[i*20]} and accuracy is {training_acc[i*20]}',end=' ')
    print(f' || testing loss at epoch {i*20} is {test_loss[i*20]} and accuracy is {test_acc[i*20]}')






In [78]:
model = NN()

In [79]:
nn_optimizers = {
    'Vanilla GD': Gradient_descent(model.parameters(),lr=0.01),
    'Momentum': MomentumGradientDescent(model.parameters(),lr=0.01,momentum=0.9),
    'Adam': Adam(model.parameters(),lr=0.03),
    'AdamW': AdamW(model.parameters(),lr=0.03),
    'RMSProp': RMSProp(model.parameters()),
}

In [81]:
for d in list(nn_optimizers.keys()):
  print(f' Results for optimizer {d}')
  train_loop(nn_optimizers[d],X_train,Y_train,X_test,Y_test,model)
  print('*********************************************************************************************************************************')


 Results for optimizer Vanilla GD
training loss at epoch 20 is 8.327617138093046e-07 and accuracy is 1.0  || testing loss at epoch 20 is 4.386856289784191e-06 and accuracy is 1.0
training loss at epoch 40 is 8.327617138093046e-07 and accuracy is 1.0  || testing loss at epoch 40 is 4.363014795671916e-06 and accuracy is 1.0
training loss at epoch 60 is 8.327617138093046e-07 and accuracy is 1.0  || testing loss at epoch 60 is 4.35904121331987e-06 and accuracy is 1.0
training loss at epoch 80 is 8.327617138093046e-07 and accuracy is 1.0  || testing loss at epoch 80 is 4.35904121331987e-06 and accuracy is 1.0
training loss at epoch 100 is 8.327617138093046e-07 and accuracy is 1.0  || testing loss at epoch 100 is 4.35904121331987e-06 and accuracy is 1.0
*********************************************************************************************************************************
 Results for optimizer Momentum
training loss at epoch 20 is 7.510182626901951e-07 and accuracy is 1.0  || testi