In [1]:
import numpy as np

In [12]:
def f(w1, w2):
    return 0.1*w1**2 + 2*w2**2

print(f(3, 4))


32.9


In [15]:
def df_w(W):
    """
    Compute the gradient dw1 and dw2 of f(w1, w2)
    Args:
        W: np.ndarray, [w1, w2]
    Returns:
        dW: np.ndarray, [dw1, dw2], array containing the gradients
    """
    dw1 = 0.2*W[0]
    dw2 = 4*W[1]
    return np.array([dw1, dw2])



### Gradient Descent for Function Minimization

- $ f(w_1, w_2) = 0.1w_1^2 + 2w_2^2 $
- **Giá trị ban đầu:** $ w_1 = -5 $, $ w_2 = -2 $
- **Learning rate:** $ \alpha = 0.4 $
- **Epochs:** 2

Tính Gradient

- $ \frac{\partial f}{\partial w_1} = 0.2w_1 $
- $ \frac{\partial f}{\partial w_2} = 4w_2 $

Epoch 1

Step 1: Tính Gradient tại $ w_1 = -5 $, $ w_2 = -2 $

- $ \frac{\partial f}{\partial w_1} = 0.2 \times -5 = -1 $
- $ \frac{\partial f}{\partial w_2} = 4 \times -2 = -8 $

Step 2: Cập nhật trọng số bằng Gradient Descent

- $ w_1 = -5 - 0.4 \times -1 = -5 + 0.4 = -4.6 $
- $ w_2 = -2 - 0.4 \times -8 = -2 + 3.2 = 1.2 $

Epoch 2

Step 1: Tính Gradient tại $ w_1 = -4.6 $, $ w_2 = 1.2 $

- $ \frac{\partial f}{\partial w_1} = 0.2 \times -4.6 = -0.92 $
- $ \frac{\partial f}{\partial w_2} = 4 \times 1.2 = 4.8 $

Step 2: Cập nhật trọng số bằng Gradient Descent

- $ w_1 = -4.6 - 0.4 \times -0.92 = -4.6 + 0.368 = -4.232 $
- $ w_2 = 1.2 - 0.4 \times 4.8 = 1.2 - 1.92 = -0.72 $

Vậy

- $ w_1 \approx -4.232 $
- $ w_2 \approx -0.72 $


In [23]:
# Exercise 1: Gradient descent

def sgd(W, dW, lr):
    """
    Update the weights W using the gradient descent update rule
    Args:
        W: np.ndarray, [w1, w2]
        dW: np.ndarray, [dw1, dw2]
        lr: float, learning rate
    Returns:
        W: np.ndarray, [w1, w2]
    """
    W = W - lr*dW
    return W
def train_pl(optimizer, lr, epochs):
    """
    Find the minimum of f(w1, w2) using the optimizer
    Args:
        optimizer: function, the optimizer to use
        lr: float, learning rate
        epochs: int, number of epochs
    Returns:
        result: list: list of pairs [w1, w2] after each epoch
    """
    # initial point
    W = np.array([-5, -2], dtype=np.float32)

    # list of results
    result = [W]

    # loop over epochs
    for _ in range(epochs):
        dW = df_w(W)
        W = optimizer(W, dW, lr)
        result.append(W)
    return result

print(train_pl(sgd, 0.4, 30))


[array([-5., -2.], dtype=float32), array([-4.6,  1.2]), array([-4.232, -0.72 ]), array([-3.89344,  0.432  ]), array([-3.5819648, -0.2592   ]), array([-3.29540762,  0.15552   ]), array([-3.03177501, -0.093312  ]), array([-2.78923301,  0.0559872 ]), array([-2.56609437, -0.03359232]), array([-2.36080682,  0.02015539]), array([-2.17194227, -0.01209324]), array([-1.99818689,  0.00725594]), array([-1.83833194, -0.00435356]), array([-1.69126538,  0.00261214]), array([-1.55596415, -0.00156728]), array([-1.43148702e+00,  9.40369969e-04]), array([-1.31696806e+00, -5.64221981e-04]), array([-1.21161061e+00,  3.38533189e-04]), array([-1.11468176e+00, -2.03119913e-04]), array([-1.02550722e+00,  1.21871948e-04]), array([-9.43466646e-01, -7.31231688e-05]), array([-8.67989314e-01,  4.38739013e-05]), array([-7.98550169e-01, -2.63243408e-05]), array([-7.34666155e-01,  1.57946045e-05]), array([-6.75892863e-01, -9.47676268e-06]), array([-6.21821434e-01,  5.68605761e-06]), array([-5.72075719e-01, -3.4116345

### Gradient Descent with Momentum for Function Minimization

- $ f(w_1, w_2) = 0.1w_1^2 + 2w_2^2 $
- **Giá trị ban đầu:** $ w_1 = -5 $, $ w_2 = -2 $
- **Learning rate:** $ \alpha = 0.6 $
- **Beta:** $ \beta = 0.5 $
- **Vận tốc ban đầu:** $ V_t = [v_1, v_2] = [0, 0] $
- **Epochs:** 2

Tính Gradient

- $ \frac{\partial f}{\partial w_1} = 0.2w_1 $
- $ \frac{\partial f}{\partial w_2} = 4w_2 $

Epoch 1

Step 1: Tính Gradient tại $ w_1 = -5 $, $ w_2 = -2 $

- $ \frac{\partial f}{\partial w_1} = 0.2 \times -5 = -1 $
- $ \frac{\partial f}{\partial w_2} = 4 \times -2 = -8 $

Step 2: Tính Vận tốc $ V_t $ bằng Momentum

- $ V_t = \beta V_{t-1} + (1 - \beta) \times dW_t $
- $ V_1 = 0.5 \times [0, 0] + (1 - 0.5) \times [-1, -8] $
- $ V_1 = [0, 0] + [0.5 \times -1, 0.5 \times -8] $
- $ V_1 = [-0.5, -4] $

Step 3: Cập nhật trọng số bằng Gradient Descent with Momentum

- $ W_t = W_{t-1} - \alpha \times V_t $
- $ w_1 = -5 - 0.6 \times -0.5 = -5 + 0.3 = -4.7 $
- $ w_2 = -2 - 0.6 \times -4 = -2 + 2.4 = 0.4 $

Epoch 2

Step 1: Tính Gradient tại $ w_1 = -4.7 $, $ w_2 = 0.4 $

- $ \frac{\partial f}{\partial w_1} = 0.2 \times -4.7 = -0.94 $
- $ \frac{\partial f}{\partial w_2} = 4 \times 0.4 = 1.6 $

Step 2: Tính Vận tốc $ V_t $ bằng Momentum

- $ V_t = \beta V_{t-1} + (1 - \beta) \times dW_t $
- $ V_2 = 0.5 \times [-0.5, -4] + (1 - 0.5) \times [-0.94, 1.6] $
- $ V_2 = [-0.25, -2] + [0.5 \times -0.94, 0.5 \times 1.6] $
- $ V_2 = [-0.25, -2] + [-0.47, 0.8] $
- $ V_2 = [-0.72, -1.2] $

Step 3: Cập nhật trọng số bằng Gradient Descent with Momentum

- $ W_t = W_{t-1} - \alpha \times V_t $
- $ w_1 = -4.7 - 0.6 \times -0.72 = -4.7 + 0.432 = -4.268 $
- $ w_2 = 0.4 - 0.6 \times -1.2 = 0.4 + 0.72 = 1.12 $


Kết quả:

- $ w_1 \approx -4.268 $
- $ w_2 \approx 1.12 $



In [25]:
def sgd_momentum(W, V, dW, lr, gamma=0.5):
    """
    Update the weights W using the gradient descent with momentum update rule
    Args:
        W: np.ndarray, [w1, w2]
        V: np.ndarray, [v1, v2]
        dW: np.ndarray, [dw1, dw2]
        lr: float, learning rate
        gamma: float, momentum
    Returns:
        V: np.ndarray, [v1, v2]
        W: np.ndarray, [w1, w2]
    """
    V = gamma*V + (1-gamma)*dW
    W = W - lr*V
    return W, V

def train_pl(optimizer, lr, epochs):
    """
    Find the minimum of f(w1, w2) using the optimizer
    Args:
        optimizer: function, the optimizer to use
        lr: float, learning rate
        epochs: int, number of epochs
    Returns:
        result: list: list of pairs [w1, w2] after each epoch
    """
    # initial point
    W = np.array([-5, -2], dtype=np.float32)
    V = np.array([0, 0], dtype=np.float32)
    # list of results
    result = [W]

    # loop over epochs
    for _ in range(epochs):
        dW = df_w(W)
        W, V = optimizer(W, V, dW, lr)
        result.append(W)
    return result

print(train_pl(sgd_momentum, 0.6, 30))



[array([-5., -2.], dtype=float32), array([-4.7,  0.4]), array([-4.268,  1.12 ]), array([-3.79592,  0.136  ]), array([-3.3321248, -0.5192   ]), array([-2.90029971, -0.22376   ]), array([-2.51036919,  0.192472  ]), array([-2.16478177,  0.1696216 ]), array([-1.86210116, -0.04534952]), array([-1.59903478, -0.09841566]), array([-1.37155951, -0.00684994]), array([-1.1755283 ,  0.04715285]), array([-1.006981  ,  0.01757082]), array([-0.86228849, -0.01830518]), array([-0.73820492, -0.01427696]), array([-0.63187084,  0.0048695 ]), array([-0.54079155,  0.00859933]), array([-4.62804416e-01,  1.45050014e-04]), array([-0.39604258, -0.00425615]), array([-0.33889911, -0.00134937]), array([-0.28999343,  0.00172326]), array([-0.24814098,  0.00119166]), array([-0.2123263 , -0.00050413]), array([-0.18167938, -0.00074707]), array([-1.55455157e-01,  2.79448010e-05]), array([-0.13301574,  0.00038192]), array([-1.13815082e-01,  1.00603444e-04]), array([-0.09738585, -0.00016078]), array([-8.33280829e-02, -9.8

### RMSProp Optimization for Function Minimization


- $ f(w_1, w_2) = 0.1w_1^2 + 2w_2^2 $
- **Giá trị khởi đầu:** $ w_1 = -5 $, $ w_2 = -2 $
- **Giá trị khởi đầu của squared gradients:** $ s_1 = 0 $, $ s_2 = 0 $
- **Learning rate:** $ \alpha = 0.3 $
- **Decay rate:** $ \gamma = 0.9 $
- **Epsilon:** $ \epsilon = 10^{-6} $
- **Epochs:** 2

Tính Gradient

- $ \frac{\partial f}{\partial w_1} = 0.2w_1 $
- $ \frac{\partial f}{\partial w_2} = 4w_2 $

Epoch 1

Step 1: Tính Gradient tại $ w_1 = -5 $, $ w_2 = -2 $

- $ \frac{\partial f}{\partial w_1} = 0.2 \times -5 = -1 $
- $ \frac{\partial f}{\partial w_2} = 4 \times -2 = -8 $

Step 2: Cập nhật squared gradients $ s_1 $ và $ s_2 $

- $ s_1 = \gamma s_1 + (1 - \gamma) \times (\frac{\partial f}{\partial w_1})^2 $
- $ s_1 = 0.9 \times 0 + 0.1 \times (-1)^2 = 0.1 $
- $ s_2 = \gamma s_2 + (1 - \gamma) \times (\frac{\partial f}{\partial w_2})^2 $
- $ s_2 = 0.9 \times 0 + 0.1 \times (-8)^2 = 6.4 $

Step 3: Cập nhật trọng số bằng RMSProp

- $ w_1 = w_1 - \alpha \times \frac{\frac{\partial f}{\partial w_1}}{\sqrt{s_1 + \epsilon}} $
- $ w_1 = -5 - 0.3 \times \frac{-1}{\sqrt{0.1 + 10^{-6}}} \approx -4.0513 $
- $ w_2 = w_2 - \alpha \times \frac{\frac{\partial f}{\partial w_2}}{\sqrt{s_2 + \epsilon}} $
- $ w_2 = -2 - 0.3 \times \frac{-8}{\sqrt{6.4 + 10^{-6}}} \approx -1.0513 $

Epoch 2

Step 1: Tính Gradient tại $ w_1 = -4.0513 $, $ w_2 = -1.0513 $

- $ \frac{\partial f}{\partial w_1} = 0.2 \times -4.0513 \approx -0.8103 $
- $ \frac{\partial f}{\partial w_2} = 4 \times -1.0513 \approx -4.2053 $

Step 2: Cập nhật squared gradients $ s_1 $ và $ s_2 $

- $ s_1 = \gamma s_1 + (1 - \gamma) \times (\frac{\partial f}{\partial w_1})^2 $
- $ s_1 = 0.9 \times 0.1 + 0.1 \times (-0.8103)^2 \approx 0.1556 $
- $ s_2 = \gamma s_2 + (1 - \gamma) \times (\frac{\partial f}{\partial w_2})^2 $
- $ s_2 = 0.9 \times 6.4 + 0.1 \times (-4.2053)^2 \approx 7.5284 $

Step 3: Cập nhật trọng số bằng RMSProp

- $ w_1 = w_1 - \alpha \times \frac{\frac{\partial f}{\partial w_1}}{\sqrt{s_1 + \epsilon}} $
- $ w_1 = -4.0513 - 0.3 \times \frac{-0.8103}{\sqrt{0.1556 + 10^{-6}}} \approx -3.4351 $
- $ w_2 = w_2 - \alpha \times \frac{\frac{\partial f}{\partial w_2}}{\sqrt{s_2 + \epsilon}} $
- $ w_2 = -1.0513 - 0.3 \times \frac{-4.2053}{\sqrt{7.5284 + 10^{-6}}} \approx -0.5920 $

Kết quả:

- $ w_1 \approx -3.4351 $
- $ w_2 \approx -0.5920 $

In [29]:
def RMSprop(W, S, dW, lr, gamma=0.9, epsilon=1e-6):
    """
    Update the weights W using RMSprop
    Args:
        W: np.ndarray, [w1, w2]
        S: np.ndarray, [s1, s2]
        dW: np.ndarray, [dw1, dw2]
        lr: float, learning rate
        gamma: float, decay rate 
        epsilon: float
    Returns:
        S: np.ndarray, [s1, s2],
        W: np.ndarray, [w1, w2], updated weights
    """
    S = gamma*S + (1- gamma)*dW**2
    W = W - lr* dW/(np.sqrt(S + epsilon))
    return W, S


def train_pl(optimizer, lr, epochs):
    """
    Find the minimum of f(w1, w2) using the optimizer
    Args:
        optimizer: function, the optimizer to use
        lr: float, learning rate
        epochs: int, number of epochs
    Returns:
        result: list: list of pairs [w1, w2] after each epoch
    """
    # initial point
    W = np.array([-5, -2], dtype=np.float32)
    S = np.array([0, 0], dtype=np.float32)
    # list of results
    result = [W]

    # loop over epochs
    for _ in range(epochs):
        dW = df_w(W)
        W, S = optimizer(W, S, dW, lr)
        result.append(W)
    return result

print(train_pl(RMSprop, 0.3, 30))


[array([-5., -2.], dtype=float32), array([-4.05132145, -1.05131678]), array([-3.43519754, -0.59152343]), array([-2.95893693, -0.3294394 ]), array([-2.56546289, -0.17756482]), array([-2.22920552, -0.09163256]), array([-1.93626752, -0.04494499]), array([-1.67817686, -0.02081423]), array([-1.44934985, -0.00903559]), array([-1.24588199, -0.00364591]), array([-1.06490301, -0.00135351]), array([-9.04202260e-01, -4.56444431e-04]), array([-7.61996495e-01, -1.37562928e-04]), array([-6.36778499e-01, -3.62601019e-05]), array([-5.27215237e-01, -8.11337456e-06]), array([-4.32078505e-01, -1.47473412e-06]), array([-3.50198507e-01, -2.02783991e-07]), array([-2.80434649e-01, -1.84231187e-08]), array([-2.21659834e-01, -7.67742748e-10]), array([-1.72755512e-01,  7.80451998e-12]), array([-1.32615134e-01, -5.05794800e-13]), array([-1.00153779e-01,  6.19123501e-14]), array([-7.43217708e-02, -1.13373781e-14]), array([-5.41201278e-02,  2.80166702e-15]), array([-3.86159157e-02, -8.81341191e-16]), array([-2.695

In [31]:
def adam(W, V, S, dW, lr, t, beta1=0.9, beta2=0.999, epsilon=1e-6):
    """
    Update the weights W using the Adam optimizer
    Args:
        W: np.ndarray, [w1, w2]
        V: np.ndarray, [v1, v2]
        S: np.ndarray, [s1, s2]
        dW: np.ndarray, [dw1, dw2]
        lr: float, learning rate
        t: int, time step
        beta1: float, decay rate for the first moment
        beta2: float, decay rate for the second moment
        epsilon: float, small constant to prevent division by zero
    Returns:
        W: np.ndarray, [w1, w2], updated weights
        V: np.ndarray, [v1, v2], updated first moment
        S: np.ndarray, [s1, s2], updated second moment
    """
    V = beta1 * V + (1 - beta1) * dW
    S = beta2 * S + (1 - beta2) * (dW ** 2)
    V_corr = V / (1 - beta1 ** (t+1))
    S_corr = S / (1 - beta2 ** (t+1))
    W = W - lr * V_corr / (np.sqrt(S_corr) + epsilon)
    return W, V, S

def train_pl(optimizer, lr, epochs):
    W = np.array([-5, -2], dtype=np.float32)
    V = np.array([0, 0], dtype=np.float32)
    S = np.array([0, 0], dtype=np.float32)
    result = [W]

    for t in range(epochs):
        dW = df_w(W)
        W, V, S = optimizer(W, V, S, dW, lr, t)
        result.append(W)
    return result

print(train_pl(adam, 0.2, 30))


[array([-5., -2.], dtype=float32), array([-4.8000002 , -1.80000002]), array([-4.60025478, -1.60082451]), array([-4.40094848, -1.40317262]), array([-4.20227764, -1.20787822]), array([-4.00445033, -1.01592745]), array([-3.80768638, -0.82847307]), array([-3.61221732, -0.64684159]), array([-3.41828623, -0.47252765]), array([-3.22614739, -0.30716934]), array([-3.03606592, -0.15249855]), array([-2.84831706, -0.01026326]), array([-2.66318543,  0.11787552]), array([-2.480964  ,  0.23046161]), array([-2.30195279,  0.3263587 ]), array([-2.12645742,  0.40484195]), array([-1.95478732,  0.46564961]), array([-1.7872537 ,  0.50898799]), array([-1.62416726,  0.53549442]), array([-1.46583566,  0.54617144]), array([-1.31256067,  0.54230812]), array([-1.16463526,  0.52540206]), array([-1.02234036,  0.4970906 ]), array([-0.88594163,  0.4590951 ]), array([-0.75568617,  0.41317781]), array([-0.63179919,  0.3611089 ]), array([-0.51448089,  0.30464048]), array([-0.40390346,  0.24548409]), array([-0.30020842, 