In [2]:
import torch

In [3]:
import numpy as np
# https://towardsdatascience.com/how-to-implement-an-adam-optimizer-from-scratch-76e7b217f1cc
class AdamOptim():
    """Minimal NumPy implementation of the Adam update for a single parameter array.

    The optimizer keeps exponential moving averages of the gradient (first
    moment, ``m_dw``) and of the squared gradient (second moment, ``v_dw``)
    and applies the bias-corrected Adam step in :meth:`update`.

    Args:
        eta: learning rate (step size).
        beta1: decay rate of the first-moment (momentum) average.
        beta2: decay rate of the second-moment (RMS) average.
        epsilon: small constant added to the denominator for numerical stability.
    """
    def __init__(self, eta=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        # Moment estimates start as scalar zeros; they take the gradient's
        # shape on the first update through broadcasting.
        self.m_dw, self.v_dw = 0, 0
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.eta = eta
    def update(self, t, w, dw):
        """Apply one Adam step and return the updated weights.

        Args:
            t: 1-based step counter used for bias correction.
            w: current weights.
            dw: gradient of the loss w.r.t. ``w`` from the current minibatch.

        Returns:
            The updated weights (``w`` itself is not modified in place).

        Raises:
            ValueError: if ``t < 1``. The bias correction divides by
                ``1 - beta**t``, which is zero at ``t == 0`` and negative for
                ``t < 0``, so such steps would yield NaN/inf or a flipped update.
        """
        if t < 1:
            raise ValueError(
                f"t must be >= 1 (got {t}); bias correction divides by 1 - beta**t"
            )

        ## momentum beta 1
        self.m_dw = self.beta1*self.m_dw + (1-self.beta1)*dw

        ## rms beta 2
        self.v_dw = self.beta2*self.v_dw + (1-self.beta2)*(dw**2)

        ## bias correction: undo the shrinkage caused by the zero-initialised averages
        m_dw_corr = self.m_dw/(1-self.beta1**t)
        v_dw_corr = self.v_dw/(1-self.beta2**t)

        ## update weights
        w = w - self.eta*(m_dw_corr/(np.sqrt(v_dw_corr)+self.epsilon))
        return w

In [4]:
class Net(torch.nn.Module):
    """Toy model: a bias-free linear map from 3 features to 2 outputs, then SiLU.

    The linear weights are set to the constant 0.5, so every fresh ``Net()``
    starts from the same parameters.
    """

    def __init__(self):
        super().__init__()
        linear = torch.nn.Linear(in_features=3, out_features=2, bias=False)
        # Overwrite the default random init with a constant value.
        torch.nn.init.constant_(linear.weight, 0.5)
        self.projection = linear
        self.act = torch.nn.SiLU()

    def forward(self, x):
        """Return ``SiLU(projection(x))`` for input ``x``."""
        projected = self.projection(x)
        return self.act(projected)

# Draw the toy data from a dedicated, seeded generator so that a fresh
# "Restart & Run All" reproduces the same X and Y (and hence the same
# gradients) without touching the global RNG state.
# NOTE: numeric outputs recorded from an earlier, unseeded run will not match
# the values produced with this seed.
DATA_SEED = 0
_data_rng = torch.Generator().manual_seed(DATA_SEED)
# 10 samples each; integers in [0, 10) cast to float32.
X = torch.randint(0, 10, (10, 3), generator=_data_rng).float()  # inputs: 3 features per sample
Y = torch.randint(0, 10, (10, 2), generator=_data_rng).float()  # targets: 2 values per sample


# batch_size=10, lr=0.01

In [5]:
# Optimizer class (not an instance) shared by the experiment cells; each cell
# instantiates it as optr(net.parameters(), lr=...).
optr = torch.optim.SGD

In [17]:
# Gradient accumulation: two half-batches of 5 samples, one optimizer step.
net = Net()
net.train()
opt = optr(net.parameters(), lr=0.01)
opt.zero_grad()
for rows in (slice(None, 5), slice(5, None)):
    logit = net(X[rows])
    loss = torch.nn.functional.mse_loss(logit, Y[rows])
    # Each half carries half of the full-batch mean loss, so halve it before
    # its gradient is accumulated.
    (loss / 2).backward()
opt.step()
print(net.projection.weight.grad)
print(net.projection.weight)

tensor([[23.3192, 15.8141, 17.0207],
        [10.1205,  9.3564, 11.1408]])
Parameter containing:
tensor([[0.2668, 0.3419, 0.3298],
        [0.3988, 0.4064, 0.3886]], requires_grad=True)


In [16]:
# Reference run: a single SGD step computed on the full batch at once.
net = Net()
net.train()
opt = optr(net.parameters(), lr=0.01)
opt.zero_grad()
logit = net(X)
loss = torch.nn.functional.mse_loss(input=logit, target=Y, reduction="mean")
loss.backward()
opt.step()
# Accumulated gradient first, then the weights after the step.
print(net.projection.weight.grad, net.projection.weight, sep="\n")

tensor([[23.3192, 15.8141, 17.0207],
        [10.1205,  9.3564, 11.1408]])
Parameter containing:
tensor([[0.2668, 0.3419, 0.3298],
        [0.3988, 0.4064, 0.3886]], requires_grad=True)


In [11]:
# Compute the full-batch gradient once, then apply one update by hand with
# plain SGD and with the NumPy AdamOptim for comparison.
net = Net()
net.train()
logit = net(X)
loss = torch.nn.functional.mse_loss(logit, Y)
loss.backward()

weight = net.projection.weight
print(weight.grad)

print("SGD:")
sgd_step = 0.01 * weight.grad
print(weight.data - sgd_step)

print("Adam:")
adam = AdamOptim(eta=0.01)
# t=1: this is the first Adam step, taken from zero-initialised moments.
print(adam.update(1, weight.data.numpy(), weight.grad.data.numpy()))

tensor([[23.3192, 15.8141, 17.0207],
        [10.1205,  9.3564, 11.1408]])
SGD:
tensor([[0.2668, 0.3419, 0.3298],
        [0.3988, 0.4064, 0.3886]])
Adam:
[[0.49 0.49 0.49]
 [0.49 0.49 0.49]]


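# batch_size=1, lr=0.001 (per-sample gradients are summed, so the learning rate is divided by 10 instead; only valid for SGD) — DO NOT USE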

In [13]:
net = Net()
net.train()
# The per-sample gradients below are summed rather than averaged, so the
# learning rate is cut by 10x to compensate. This only works for SGD; with
# Adam, using loss/batch or the raw loss reportedly gives the same update.
opt = optr(net.parameters(), lr=0.001)
opt.zero_grad()
for idx in range(len(X)):
    x, y = X[idx], Y[idx]
    logit = net(x)
    loss = torch.nn.functional.mse_loss(logit, y)
    loss.backward()  # accumulates into .grad; no step until the loop ends
opt.step()
print(net.projection.weight.grad)
print(net.projection.weight)

tensor([[233.1915, 158.1407, 170.2069],
        [101.2046,  93.5642, 111.4084]])
Parameter containing:
tensor([[0.2668, 0.3419, 0.3298],
        [0.3988, 0.4064, 0.3886]], requires_grad=True)


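# batch_size=1, lr=0.01, loss/10 (each per-sample loss is scaled by 1/batch so the accumulated gradient matches the full-batch gradient) — USE THIS!!!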

In [14]:
# Per-sample accumulation with the loss scaled by 1/len(X): the summed
# gradients then equal the full-batch mean gradient, so the same lr=0.01 can
# be kept.
torch.manual_seed(42)
net = Net()
opt = optr(net.parameters(), lr=0.01)
net.train()
opt.zero_grad()
for x, y in zip(X, Y):
    logit = net(x)
    # Divide by the number of accumulated samples (len(X), previously the
    # literal 10). Caveat: `loss` is now 1/len(X) of the true per-sample MSE,
    # so do not report it as a metric without undoing this scaling.
    loss = torch.nn.functional.mse_loss(logit, y) / len(X)
    loss.backward()
opt.step()
# Read the gradient through the public `.grad` attribute, as the other cells do.
print(net.projection.weight.grad)
print(net.projection.weight)

tensor([[23.3191, 15.8141, 17.0207],
        [10.1205,  9.3564, 11.1408]])
Parameter containing:
tensor([[0.2668, 0.3419, 0.3298],
        [0.3988, 0.4064, 0.3886]], requires_grad=True)
