## Dataset Preparation

In [103]:
import numpy as np

X = np.array([1, 2, 3, 4]) # shape=(4, )
y = np.array([3, 5, 7, 9]) # y=2x+1


## 1. Gradient Descent

### 1.1 Batch Gradient Descent

In [104]:
def predict(x, w, b):
    return w*x+b

def gradients(x, y, w, b):
    y_pred = predict(x, w, b)
    dw = (y_pred-y)*x
    db = (y_pred-y)
    return dw, db

def mse_loss(x, y, w, b):
    prediction = predict(x, w, b)
    return ((prediction-y)**2).mean()

In [105]:
def batch_gd(X, y, lr=0.1, epochs=50):
    w, b = 0.0, 0.0
    N = len(X) # the number of dataset

    for ep in range(epochs):
        dw_sum, db_sum = 0.0, 0.0
        for i in range(N): # iterate all data points in the dataset
            dw, db = gradients(X[i], y[i], w, b)
            dw_sum+=dw
            db_sum+=db

        # average gradient
        w-=lr*dw_sum/N
        b-=lr*db_sum/N
        mse = mse_loss(X, y, w, b)
        print(f"Epoch {ep}, w: {w}, b: {b}, MSE Loss: {mse}")
    return w, b

batch_gd(X, y)

Epoch 0, w: 1.75, b: 0.6000000000000001, MSE Loss: 1.1287500000000006
Epoch 1, w: 2.0375, b: 0.7025000000000001, MSE Loss: 0.0432718749999998
Epoch 2, w: 2.0837499999999998, b: 0.722875, MSE Loss: 0.013357640625000014
Epoch 3, w: 2.09021875, b: 0.7296500000000001, MSE Loss: 0.01218159857421868
Epoch 4, w: 2.0901421875, b: 0.7341303125000002, MSE Loss: 0.011798419380029259
Epoch 5, w: 2.08900296875, b: 0.7381817343750001, MSE Loss: 0.011447252994228812
Epoch 6, w: 2.08770530859375, b: 0.7421128187500001, MSE Loss: 0.011107082849989253
Epoch 7, w: 2.0863981224609374, b: 0.7459752097265626, MSE Loss: 0.010777036118485434
Epoch 8, w: 2.085105728183594, b: 0.749778158138672, MSE Loss: 0.010456797121681672
Epoch 9, w: 2.0838318925112307, b: 0.7535239102789063, MSE Loss: 0.010146074018873748
Epoch 10, w: 2.0825769955580813, b: 0.757213546123208, MSE Loss: 0.009844584034832125
Epoch 11, w: 2.0813408623587186, b: 0.7608479426213669, MSE Loss: 0.00955205280768426
Epoch 12, w: 2.0801232299343377,

(np.float64(2.0458560156699397), np.float64(0.8651777219412482))

### 1.2 Mini-Batch SGD

In [106]:
def mini_batch_sgd(X, y, lr=0.1, batch_size =2, epochs=50):
    w = 0.0
    b = 0.0
    N = len(X)

    for ep in range(epochs):
        # shuffle
        indices = np.random.permutation(N)
        X_shuffled = X[indices]
        y_shuffled = y[indices]

        # 从shuffle后的数据中获取batch，并且遍历这个batch总共epochs次
        for start in range(0, N, batch_size): # get the batch-size dataset
            end = start+batch_size
            dw_sum, db_sum = 0, 0
            batch_X = X_shuffled[start:end+1]
            batch_y = y_shuffled[start:end+1]
            B = len(batch_X)

            for i in range(B): # iterate all samples in this mini-batch
                dw, db = gradients(batch_X[i], batch_y[i], w, b) # 如果不设置i的话 会导致计算gradients的时候维度不一致
                dw_sum+=dw
                db_sum+=db
            
            # 每次得到一个batch就更新一次参数
            w-=lr*dw_sum/B
            b-=lr*db_sum/B
        
        mse = mse_loss(batch_X, batch_y, w, b)
        print(f"Epoch: {ep}, w: {w}, b: {b}, MSE Loss: {mse}")
    return w, b

mini_batch_sgd(X, y)

Epoch: 0, w: 2.15, b: 0.81, MSE Loss: 0.09009999999999999
Epoch: 1, w: 2.06795, b: 0.7896000000000001, MSE Loss: 0.012030981250000012
Epoch: 2, w: 2.079997, b: 0.803493, MSE Loss: 0.007453889634499964
Epoch: 3, w: 2.0763423100000002, b: 0.80998734, MSE Loss: 0.007157165519681978
Epoch: 4, w: 2.0685570882333333, b: 0.8135676133666666, MSE Loss: 0.007132360201076544
Epoch: 5, w: 2.055821311449667, b: 0.8148791911909999, MSE Loss: 0.003427784027656878
Epoch: 6, w: 2.065897364531771, b: 0.8263414196955465, MSE Loss: 0.006095044692954273
Epoch: 7, w: 2.052159605844744, b: 0.827970723302549, MSE Loss: 0.007854483805542864
Epoch: 8, w: 2.06127504714186, b: 0.8357557570686279, MSE Loss: 0.001060906808646481
Epoch: 9, w: 2.0492479493628193, b: 0.8366656105913581, MSE Loss: 0.0026684254950992145
Epoch: 10, w: 2.048040121089359, b: 0.8408429676047194, MSE Loss: 0.002533954652721304
Epoch: 11, w: 2.060659340138556, b: 0.8507097040024615, MSE Loss: 0.004318928811244091
Epoch: 12, w: 2.0539163470898

(np.float64(2.014554061674171), np.float64(0.9520065591498098))

### 1.3 SGD

In [107]:
def sgd(X, y, lr=0.1, epochs=50):
    w, b = 0, 0
    N = len(X)

    for ep in range(epochs):
        for i in range(N):
            dw, db = gradients(X[i], y[i], w, b)
            # 每次得到一个样本就更新一次参数
            w-=lr*dw
            b-=lr*db
        mse = mse_loss(X, y, w, b)
            
        print(f"Epoch: {ep}, w: {w}, b: {b}, MSE: {mse}")
    return w, b

sgd(X, y)

Epoch: 0, w: 1.9993999999999998, b: 1.0030999999999999, MSE: 3.0099999999977384e-06
Epoch: 1, w: 1.9994429000000002, b: 1.00287835, MSE: 2.59495787250237e-06
Epoch: 2, w: 1.9994827326500002, b: 1.0026725479749998, MSE: 2.2371449701169786e-06
Epoch: 3, w: 1.9995197172655248, b: 1.0024814607947872, MSE: 1.928670083761853e-06
Epoch: 4, w: 1.99955405748104, b: 1.00230403634796, MSE: 1.6627301054205377e-06
Epoch: 5, w: 1.9995859423711462, b: 1.002139297749081, MSE: 1.4334599923297457e-06
Epoch: 6, w: 1.999615547491609, b: 1.0019863379600216, MSE: 1.2358034192714807e-06
Epoch: 7, w: 1.999643035845959, b: 1.0018443147958798, MSE: 1.0654012663448599e-06
Epoch: 8, w: 1.9996685587829728, b: 1.0017124462879745, MSE: 9.184954828815488e-07
Epoch: 9, w: 1.9996922568299904, b: 1.0015900063783845, MSE: 7.91846207362491e-07
Epoch: 10, w: 1.9997142604666456, b: 1.0014763209223299, MSE: 6.826603154812293e-07
Epoch: 11, w: 1.9997346908432805, b: 1.0013707639763834, MSE: 5.885298205654958e-07
Epoch: 12, w:

(np.float64(1.9999841705893395), np.float64(1.000081785288414))

## 2. Momentum

In [108]:
def sgd_momentum(X, y, momentum=0.5, lr=0.1, epochs=50):
    # momentum如果设置为0.9，会导致MSE特别大，这是因为step过大，导致直接跳过了最优点
    N = len(X)
    w, b = 0, 0
    vw, vb = 0, 0
    for ep in range(epochs):
        indices = np.random.permutation(N)
        for i in indices:
            dw, db = gradients(X[i], y[i], w, b)
            
            vw = momentum*vw+dw
            vb = momentum*vb+db
            w-= lr*vw
            b-= lr*vb
        
        mse = mse_loss(X, y, w, b)
        print(f"Eppch: {ep}, w: {w}, b: {b}, MSE: {mse}")
    return w, b

sgd_momentum(X, y)


Eppch: 0, w: 2.616, b: 0.6545, MSE: 1.901150250000002
Eppch: 1, w: 2.5293655, b: 0.39166649999999975, MSE: 0.8616245546778739
Eppch: 2, w: 2.1460053975, b: 0.2993698784999997, MSE: 0.13928549094619824
Eppch: 3, w: 2.34491951619745, b: 0.42284159272909977, MSE: 0.23001687896175618
Eppch: 4, w: 2.0409837823315695, b: 0.35597821401563573, MSE: 0.295389345461142
Eppch: 5, w: 2.238544573541148, b: 0.47892143983552443, MSE: 0.07679690302819285
Eppch: 6, w: 2.1976380292204225, b: 0.5918000912654926, MSE: 0.05620396749563249
Eppch: 7, w: 1.9831547931891131, b: 0.598844702468076, MSE: 0.19684149993272876
Eppch: 8, w: 2.0053174099919993, b: 0.6894489377854326, MSE: 0.0883973870040401
Eppch: 9, w: 2.0954986220372134, b: 0.7373152937934615, MSE: 0.011973018592491143
Eppch: 10, w: 2.0503988705027596, b: 0.7547246246040356, MSE: 0.017402346474800948
Eppch: 11, w: 2.054217367584534, b: 0.7882884221210906, MSE: 0.009475992117638495
Eppch: 12, w: 2.0329876696719027, b: 0.8247120898639602, MSE: 0.009975

(np.float64(2.0003929624891765), np.float64(0.9991384058079079))

## 3. Adagrad

In [109]:
def sgd_adagrad(X, y, epochs = 50, lr = 0.1, eps = 1e-6):
    w, b = 0, 0
    N = len(X)
    Gw, Gb = 0, 0
    for ep in range(epochs):
        shuffle_N = np.random.permutation(N)
        for i in shuffle_N: # shuffle
            dw, db = gradients(X[i], y[i], w, b)

            Gw+=dw**2
            Gb+=db**2
            
            w-=lr*dw/(np.sqrt(Gw)+eps)
            b-=lr*db/(np.sqrt(Gb)+eps)

        mse = mse_loss(X, y, w, b)
        print(f"Epoch: {ep}, w: {w}, b: {b}, MSE: {mse}")
    return w, b

sgd_adagrad(X, y)

Epoch: 0, w: 0.29724298966890456, b: 0.29125935735176256, MSE: 28.281739559158375
Epoch: 1, w: 0.4191319854548814, b: 0.4299333488559187, MSE: 23.574554257097965
Epoch: 2, w: 0.5079242566673225, b: 0.5310255549788171, MSE: 20.415839177191195
Epoch: 3, w: 0.5805147686455517, b: 0.6138992781496275, MSE: 18.001432545073293
Epoch: 4, w: 0.6442059639026916, b: 0.6863193086716263, MSE: 16.011158641199415
Epoch: 5, w: 0.6998456820383827, b: 0.749811882857264, MSE: 14.36701927684529
Epoch: 6, w: 0.7498196095404815, b: 0.8064159327394459, MSE: 12.9696823802412
Epoch: 7, w: 0.7956566909780546, b: 0.8582635072293913, MSE: 11.751907261839982
Epoch: 8, w: 0.8376264393416104, b: 0.9058367711821765, MSE: 10.68947316036419
Epoch: 9, w: 0.8763276011387515, b: 0.9494365458710119, MSE: 9.756437901592054
Epoch: 10, w: 0.9125194065328224, b: 0.9902665128541306, MSE: 8.92262494141943
Epoch: 11, w: 0.9461816934465314, b: 1.0280629463987832, MSE: 8.181918969910402
Epoch: 12, w: 0.9779132972473509, b: 1.063918

(np.float64(1.5207133900787846), np.float64(1.6482960196710257))

## 4. RMSProp

In [110]:
def sgd_rmsprop(X, y, lr=0.1, epochs=50, eps=1e-6, weight_decay=0.9):
    w, b = 0, 0
    sw, sb = 0, 0
    N = len(X)

    for ep in range(epochs):
        shuffle_N = np.random.permutation(N)
        for i in shuffle_N:
            dw, db = gradients(X[i], y[i], w, b)

            sw = weight_decay*sw+(1-weight_decay)*dw**2
            sb = weight_decay*sb+(1-weight_decay)*db**2

            w-=lr*dw/(np.sqrt(sw)+eps)
            b-=lr*db/(np.sqrt(sb)+eps)

        mse = mse_loss(X, y, w, b)
        print(f"Epoch: {ep}, w: {w}, b: {b}, MSE: {mse}")
    return w, b

sgd_rmsprop(X, y)


Epoch: 0, w: 0.5438788911165449, b: 0.6727422890395638, MSE: 18.3918970422748
Epoch: 1, w: 0.8855278422479446, b: 1.0626214424168052, MSE: 8.970333602825264
Epoch: 2, w: 1.1363763452596538, b: 1.3523233097466412, MSE: 4.196601620225827
Epoch: 3, w: 1.3226510912363478, b: 1.5562833009806258, MSE: 1.8664732580613834
Epoch: 4, w: 1.4748232067830225, b: 1.7177067848051355, MSE: 0.699068271387693
Epoch: 5, w: 1.5758486398861835, b: 1.8114567413379659, MSE: 0.28684246265071583
Epoch: 6, w: 1.631519540941174, b: 1.8379736745970474, MSE: 0.17664912315554554
Epoch: 7, w: 1.671872351927647, b: 1.839987733188713, MSE: 0.13497154612451925
Epoch: 8, w: 1.6986875191484756, b: 1.8202228548738946, MSE: 0.11796769876836871
Epoch: 9, w: 1.7068051918871459, b: 1.7661961169731424, MSE: 0.10855683848344924
Epoch: 10, w: 1.7457717636758603, b: 1.7378535169918978, MSE: 0.09125179216886346
Epoch: 11, w: 1.8035320992680355, b: 1.7263021910996617, MSE: 0.10353680901947415
Epoch: 12, w: 1.7909769533151465, b: 1.

(np.float64(2.0038962882429887), np.float64(1.0151966736847813))

## 5. Adam

In [None]:
def sgd_adam(X, y, beta1=0.9, beta2=0.999, lr=0.1, epochs=50, eps=1e-8):
    w, b = 0, 0
    N = len(X)
    mw, mb = 0, 0
    vw, vb = 0, 0
    t = 0 # time step 
    # we have to record each update
    for ep in range(epochs):
        N_shuffle = np.random.permutation(N)
        for i in N_shuffle:
            t+=1
            dw, db = gradients(X[i], y[i], w, b)

            # 一阶矩
            mw = beta1*mw+(1-beta1)*dw
            mb = beta1*mb+(1-beta1)*db

            # 二阶矩
            vw = beta2*vw+(1-beta2)*(dw**2)
            vb = beta2*vb+(1-beta2)*(db**2)

            # debias
            mw_hat = mw/(1-beta1**t)
            mb_hat = mb/(1-beta1**t)

            vw_hat = vw/(1-beta2**t)
            vb_hat = vb/(1-beta2**t)

            # update
            w-=lr*mw_hat/(np.sqrt(vw_hat)+eps)
            b-=lr*mb_hat/(np.sqrt(vb_hat)+eps)

        mse = mse_loss(X, y, w, b)

        print(f"Epoch: {ep}, w: {w}, b: {b}, MSE: {mse}")
    return w, b

sgd_adam(X, y)
            

Epoch: 0, w: 0.35933804577830264, b: 0.3921213698280145, MSE: 25.544420495786852
Epoch: 1, w: 0.690671687733026, b: 0.7673584745947546, MSE: 14.43469747823232
Epoch: 2, w: 1.0080652514845525, b: 1.1227684538717513, MSE: 6.78568970602456
Epoch: 3, w: 1.2860241029190604, b: 1.4349347053526096, MSE: 2.4597155778853366
Epoch: 4, w: 1.535654205909168, b: 1.7044457807018363, MSE: 0.4778393044528084
Epoch: 5, w: 1.7208478502635376, b: 1.9023202372865322, MSE: 0.1392030609392346
Epoch: 6, w: 1.8474762783229586, b: 2.032280310440352, MSE: 0.4528426080679757
Epoch: 7, w: 1.9150480176221738, b: 2.0918349637913862, MSE: 0.7824621599642548
Epoch: 8, w: 1.9341237544087075, b: 2.0884822312237383, MSE: 0.8588155517595741
Epoch: 9, w: 1.9155416221132784, b: 2.0391797219128023, MSE: 0.6945564581715424
Epoch: 10, w: 1.8669174763377718, b: 1.9641207382874957, MSE: 0.4208228789463793
Epoch: 11, w: 1.8132831593202015, b: 1.876602447770215, MSE: 0.21152349298840875
Epoch: 12, w: 1.762066889943268, b: 1.79251

(np.float64(1.9247916700682872), np.float64(1.2086735979394938))