In [1]:
import numpy as np

In [2]:
# Gradient Descent
def df_w(W):
    """
    Arguments:
    W -- np.array[w1, w2]
    Returns:
    dW -- np.array[dw1, dw2]
    """
    w1, w2 = W[0], W[1]
    dW1, dW2 = 0.2*w1, 4*w2
    dW = np.array([dW1, dW2])    
    return dW

def sgd(W, dW, lr):
    """_summary_

    Args:
        W (np.array): [w1, w2]
        dW (np.array): [dw1, dw2]
        lr (float): learning_rate
        
    Returns:
        W (np.array): [w1, w2] after update
    """
    W = W - lr * dW
    return W

def train_pl(optimizer, lr, epochs):
    """_summary_

    Args:
        optimizer (function): function optimizer
        lr (float): learning_rate
        epochs (int): number of loop
    Returns:
        results (list): list of [w1, w2] after each epoch of update
    """
    
    W = np.array([-5, -2], dtype=np.float32)
    results = [W]
    for _ in range(epochs):
        dW = df_w(W)
        new_W = optimizer(W=W, dW=dW, lr=lr)
        results.append(new_W)
        W = new_W
    return results

In [3]:
# Question 5: [-4.232, -0.72] -> A
results = train_pl(optimizer=sgd, lr=0.4, epochs=2)
print(results)
# Question 6: [-4.09831018e-01, -4.42147839e-07] -> D
results = train_pl(optimizer=sgd, lr=0.4, epochs=30)
print(results)

[array([-5., -2.], dtype=float32), array([-4.6,  1.2]), array([-4.232, -0.72 ])]
[array([-5., -2.], dtype=float32), array([-4.6,  1.2]), array([-4.232, -0.72 ]), array([-3.89344,  0.432  ]), array([-3.5819648, -0.2592   ]), array([-3.29540762,  0.15552   ]), array([-3.03177501, -0.093312  ]), array([-2.78923301,  0.0559872 ]), array([-2.56609437, -0.03359232]), array([-2.36080682,  0.02015539]), array([-2.17194227, -0.01209324]), array([-1.99818689,  0.00725594]), array([-1.83833194, -0.00435356]), array([-1.69126538,  0.00261214]), array([-1.55596415, -0.00156728]), array([-1.43148702e+00,  9.40369969e-04]), array([-1.31696806e+00, -5.64221981e-04]), array([-1.21161061e+00,  3.38533189e-04]), array([-1.11468176e+00, -2.03119913e-04]), array([-1.02550722e+00,  1.21871948e-04]), array([-9.43466646e-01, -7.31231688e-05]), array([-8.67989314e-01,  4.38739013e-05]), array([-7.98550169e-01, -2.63243408e-05]), array([-7.34666155e-01,  1.57946045e-05]), array([-6.75892863e-01, -9.47676268e-06

In [4]:
# Gradient Descent + Momentum
def train_gdm(W, V, lr, epochs, beta):
    results = [W]
    for _ in range(epochs):
        dW1 = 0.2*W[0]
        dW2 = 4*W[1]
        dW = np.array([dW1, dW2])
        v1 = beta*V[0] + (1-beta)*dW[0]
        v2 = beta*V[1] + (1-beta)*dW[1]
        V = np.array([v1, v2])
        newW = W - lr*V
        results.append(newW)
        W = newW
    return results

In [5]:
# Question 7: [-4.268,  1.12] -> C
W = np.array([-5, -2])
V = np.array([0, 0])
lr = 0.6
beta = 0.5
results = train_gdm(W, V, lr=lr, beta=beta, epochs=2)
print(results)
# Question 8: [-6.10072592e-02  6.45162933e-05] -> D
results = train_gdm(W, V, lr=lr, beta=beta, epochs=30)
print(results)
print(results[-1])

[array([-5, -2]), array([-4.7,  0.4]), array([-4.268,  1.12 ])]
[array([-5, -2]), array([-4.7,  0.4]), array([-4.268,  1.12 ]), array([-3.79592,  0.136  ]), array([-3.3321248, -0.5192   ]), array([-2.90029971, -0.22376   ]), array([-2.51036919,  0.192472  ]), array([-2.16478177,  0.1696216 ]), array([-1.86210116, -0.04534952]), array([-1.59903478, -0.09841566]), array([-1.37155951, -0.00684994]), array([-1.1755283 ,  0.04715285]), array([-1.006981  ,  0.01757082]), array([-0.86228849, -0.01830518]), array([-0.73820492, -0.01427696]), array([-0.63187084,  0.0048695 ]), array([-0.54079155,  0.00859933]), array([-4.62804416e-01,  1.45050014e-04]), array([-0.39604258, -0.00425615]), array([-0.33889911, -0.00134937]), array([-0.28999343,  0.00172326]), array([-0.24814098,  0.00119166]), array([-0.2123263 , -0.00050413]), array([-0.18167938, -0.00074707]), array([-1.55455157e-01,  2.79448010e-05]), array([-0.13301574,  0.00038192]), array([-1.13815082e-01,  1.00603444e-04]), array([-0.097385

In [6]:
# RMSProp
def train_rms(W, S, decay_rate, lr, epochs, constant_rate):
    results = [W]
    for _ in range(epochs):
        dW1 = 0.2*W[0]
        dW2 = 4*W[1]
        dW = np.array([dW1, dW2])
        S = decay_rate * S + (1 - decay_rate) * (dW ** 2)
        grad = dW / (np.sqrt((S + constant_rate)))
        W = W - lr * grad
        results.append(W.copy())
    return results

In [7]:
# Question 11: [-3.43519754, -0.59152343] -> D
W = np.array([-5, -2])
S = np.array([0, 0])
lr = 0.3
decay_rate = 0.9
constant_rate = 0.000001
results = train_rms(W, S, decay_rate=decay_rate, lr=lr, constant_rate=constant_rate, epochs=2)
print(results)
# Question 12: [-3.00577081e-03 -3.00506084e-17] -> B
results = train_rms(W, S, decay_rate=decay_rate, lr=lr, constant_rate=constant_rate, epochs=30)
print(results)
print(results[-1])

[array([-5, -2]), array([-4.05132145, -1.05131678]), array([-3.43519754, -0.59152343])]
[array([-5, -2]), array([-4.05132145, -1.05131678]), array([-3.43519754, -0.59152343]), array([-2.95893693, -0.3294394 ]), array([-2.56546289, -0.17756482]), array([-2.22920552, -0.09163256]), array([-1.93626752, -0.04494499]), array([-1.67817686, -0.02081423]), array([-1.44934985, -0.00903559]), array([-1.24588199, -0.00364591]), array([-1.06490301, -0.00135351]), array([-9.04202260e-01, -4.56444431e-04]), array([-7.61996495e-01, -1.37562928e-04]), array([-6.36778499e-01, -3.62601019e-05]), array([-5.27215237e-01, -8.11337456e-06]), array([-4.32078505e-01, -1.47473412e-06]), array([-3.50198507e-01, -2.02783991e-07]), array([-2.80434649e-01, -1.84231187e-08]), array([-2.21659834e-01, -7.67742748e-10]), array([-1.72755512e-01,  7.80451998e-12]), array([-1.32615134e-01, -5.05794800e-13]), array([-1.00153779e-01,  6.19123501e-14]), array([-7.43217708e-02, -1.13373781e-14]), array([-5.41201278e-02,  2.8

In [8]:
def train_adam(W, V, S, B, lr, constant_rate, epochs):
    results = [W]
    t = 0
    for _ in range(epochs):
        t += 1
        dW1 = 0.2*W[0]
        dW2 = 4*W[1]
        dW = np.array([dW1, dW2])
        V = B[0]*V + (1-B[0])*dW
        S = B[1]*S + (1-B[1])*(dW**2)
        Vcorr = V / (1 - B[0]**t)
        Scorr = S / (1 - B[1]**t)
        grad = Vcorr / np.sqrt(Scorr) + constant_rate
        W = W - lr*grad
        results.append(W.copy())
    return results
        

In [9]:
# Question 13: [-4.60025478, -1.60082485] -> A
W = np.array([-5, -2])
V = np.array([0, 0])
S = np.array([0, 0])
B = np.array([0.9, 0.999])
lr = 0.2
constant_rate = 0.000001
results = train_adam(W, V, S, B, lr=lr, constant_rate=constant_rate, epochs=2)
print(results)
# Question 14: [-0.11386432  0.06793465] -> C
results = train_adam(W, V, S, B, lr=lr, constant_rate=constant_rate, epochs=30)
print(results)
print(results[-1])

[array([-5, -2]), array([-4.8000002, -1.8000002]), array([-4.60025478, -1.60082485])]
[array([-5, -2]), array([-4.8000002, -1.8000002]), array([-4.60025478, -1.60082485]), array([-4.40094847, -1.40317314]), array([-4.20227761, -1.2078789 ]), array([-4.00445029, -1.01592829]), array([-3.80768632, -0.82847406]), array([-3.61221725, -0.64684272]), array([-3.41828613, -0.47252891]), array([-3.22614727, -0.30717071]), array([-3.03606576, -0.15250001]), array([-2.84831687, -0.01026478]), array([-2.66318522,  0.11787396]), array([-2.48096375,  0.23046003]), array([-2.30195251,  0.32635713]), array([-2.12645711,  0.40484041]), array([-1.95478698,  0.46564812]), array([-1.78725332,  0.50898657]), array([-1.62416686,  0.53549307]), array([-1.46583523,  0.54617016]), array([-1.31256023,  0.54230693]), array([-1.1646348 ,  0.52540095]), array([-1.02233989,  0.49708956]), array([-0.88594116,  0.45909414]), array([-0.75568571,  0.41317691]), array([-0.63179874,  0.36110806]), array([-0.51448046,  0.

### Vanishing Problem

In [10]:
import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision
from torchvision.datasets import FashionMNIST
import torchvision.transforms as transforms
import numpy as np
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
torch.manual_seed(42)

<torch._C.Generator at 0x1097c1f70>

In [11]:
batch_size = 512
num_epochs = 300
lr = 0.01

train_dataset = FashionMNIST(
    root='./data', train=True,
    download=True, transform=transforms.ToTensor()
)
train_loader = DataLoader(train_dataset, batch_size, shuffle=True)
test_dataset = FashionMNIST(
    root='./data', train=False,
    download=True, transform=transforms.ToTensor()
)
test_loader = DataLoader(test_dataset, batch_size)

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 26421880/26421880 [00:09<00:00, 2650884.79it/s]


Extracting ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 29515/29515 [00:00<00:00, 92194.00it/s]


Extracting ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 4422102/4422102 [00:03<00:00, 1328134.41it/s]


Extracting ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 5148/5148 [00:00<00:00, 2248961.25it/s]

Extracting ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw






In [12]:
class MLP(nn.Module):
    def __init__(self, input_dims, hidden_dims, output_dims):
        super(MLP, self).__init__()
        self.layer1 = nn.Linear(input_dims, hidden_dims)
        self.layer2 = nn.Linear(hidden_dims, hidden_dims)
        self.layer3 = nn.Linear(hidden_dims, hidden_dims)
        self.layer4 = nn.Linear(hidden_dims, hidden_dims)
        self.layer5 = nn.Linear(hidden_dims, hidden_dims)
        self.output = nn.Linear(hidden_dims, output_dims)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        x = nn.Flatten()(x)
        x = self.layer1(x)
        x = self.sigmoid(x)
        x = self.layer2(x)
        x = self.sigmoid(x)
        x = self.layer3(x)
        x = self.sigmoid(x)
        x = self.layer4(x)
        x = self.sigmoid(x)
        x = self.layer5(x)
        x = self.sigmoid(x)
        out = self.output(x)
        return out

In [13]:
model = MLP(input_dims=784, hidden_dims=128, output_dims=10).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr)

In [14]:
train_losses = []
train_acc = []
val_losses = []
val_acc = []
for epoch in range(num_epochs):
    model.train()
    t_loss = 0
    t_acc = 0
    cnt = 0
    for X, y in train_loader:
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()
        outputs = model(X)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()
        t_loss += loss.item()
        t_acc += (torch.argmax(outputs, 1) == y).sum().item()
        cnt += len(y)
    t_loss /= len(train_loader)
    train_losses.append(t_loss)
    t_acc /= cnt
    train_acc.append(t_acc)
    
    model.eval()
    v_loss = 0
    v_acc = 0
    cnt = 0
    with torch.no_grad():
        for X, y in test_loader:
            X, y = X.to(device), y.to(device)
            outputs = model(X)
            loss = criterion(outputs, y)
            v_loss += loss.item()
            v_acc += (torch.argmax(outputs, 1) == y).sum().item()
            cnt += len(y)
    v_loss /= len(test_loader)
    val_losses.append(v_loss)
    v_acc /= cnt
    val_acc.append(v_acc)
    print(f"Epoch {epoch+1}/{num_epochs}, Train_Loss: {t_loss:.4f}, Train_Acc: {t_acc:.4f}, Validation Loss: {v_loss:.4f}, Val_Acc: {v_acc:.4f}")

Epoch 1/300, Train_Loss: 2.3096, Train_Acc: 0.0999, Validation Loss: 2.3026, Val_Acc: 0.1000
Epoch 2/300, Train_Loss: 2.3028, Train_Acc: 0.0981, Validation Loss: 2.3026, Val_Acc: 0.1000
Epoch 3/300, Train_Loss: 2.3028, Train_Acc: 0.0995, Validation Loss: 2.3027, Val_Acc: 0.1000
Epoch 4/300, Train_Loss: 2.3028, Train_Acc: 0.0978, Validation Loss: 2.3026, Val_Acc: 0.1000
Epoch 5/300, Train_Loss: 2.3028, Train_Acc: 0.0994, Validation Loss: 2.3027, Val_Acc: 0.1000
Epoch 6/300, Train_Loss: 2.3028, Train_Acc: 0.0990, Validation Loss: 2.3026, Val_Acc: 0.1000
Epoch 7/300, Train_Loss: 2.3028, Train_Acc: 0.0980, Validation Loss: 2.3026, Val_Acc: 0.1000
Epoch 8/300, Train_Loss: 2.3028, Train_Acc: 0.0989, Validation Loss: 2.3027, Val_Acc: 0.1000
Epoch 9/300, Train_Loss: 2.3027, Train_Acc: 0.1003, Validation Loss: 2.3027, Val_Acc: 0.1000
Epoch 10/300, Train_Loss: 2.3028, Train_Acc: 0.0997, Validation Loss: 2.3027, Val_Acc: 0.1000
Epoch 11/300, Train_Loss: 2.3028, Train_Acc: 0.0979, Validation Loss: