## Dataset Preparation

In [29]:
import numpy as np

X = np.array([1, 2, 3, 4]) # shape=(4, )
y = np.array([3, 5, 7, 9]) # y=2x+1


## 1. Gradient Descent

### 1.1 Batch Gradient Descent

In [30]:
def predict(x, w, b):
    return w*x+b

def gradients(x, y, w, b):
    y_pred = predict(x, w, b)
    dw = (y_pred-y)*x
    db = (y_pred-y)
    return dw, db

def mse_loss(x, y, w, b):
    prediction = predict(x, w, b)
    return ((prediction-y)**2).mean()

In [31]:
def batch_gd(X, y, lr=0.1, epochs=50):
    w, b = 0.0, 0.0
    N = len(X) # the number of dataset

    for ep in range(epochs):
        dw_sum, db_sum = 0.0, 0.0
        for i in range(N): # iterate all data points in the dataset
            dw, db = gradients(X[i], y[i], w, b)
            dw_sum+=dw
            db_sum+=db

        # average gradient
        w-=lr*dw_sum/N
        b-=lr*db_sum/N
        mse = mse_loss(X, y, w, b)
        print(f"Epoch {ep}, w: {w}, b: {b}, MSE Loss: {mse}")
    return w, b

batch_gd(X, y)

Epoch 0, w: 1.75, b: 0.6000000000000001, MSE Loss: 1.1287500000000006
Epoch 1, w: 2.0375, b: 0.7025000000000001, MSE Loss: 0.0432718749999998
Epoch 2, w: 2.0837499999999998, b: 0.722875, MSE Loss: 0.013357640625000014
Epoch 3, w: 2.09021875, b: 0.7296500000000001, MSE Loss: 0.01218159857421868
Epoch 4, w: 2.0901421875, b: 0.7341303125000002, MSE Loss: 0.011798419380029259
Epoch 5, w: 2.08900296875, b: 0.7381817343750001, MSE Loss: 0.011447252994228812
Epoch 6, w: 2.08770530859375, b: 0.7421128187500001, MSE Loss: 0.011107082849989253
Epoch 7, w: 2.0863981224609374, b: 0.7459752097265626, MSE Loss: 0.010777036118485434
Epoch 8, w: 2.085105728183594, b: 0.749778158138672, MSE Loss: 0.010456797121681672
Epoch 9, w: 2.0838318925112307, b: 0.7535239102789063, MSE Loss: 0.010146074018873748
Epoch 10, w: 2.0825769955580813, b: 0.757213546123208, MSE Loss: 0.009844584034832125
Epoch 11, w: 2.0813408623587186, b: 0.7608479426213669, MSE Loss: 0.00955205280768426
Epoch 12, w: 2.0801232299343377,

(np.float64(2.0458560156699397), np.float64(0.8651777219412482))

### 1.2 Mini-Batch SGD

In [32]:
def mini_batch_sgd(X, y, lr=0.1, batch_size =2, epochs=50):
    w = 0.0
    b = 0.0
    N = len(X)

    for ep in range(epochs):
        # shuffle
        indices = np.random.permutation(N)
        X_shuffled = X[indices]
        y_shuffled = y[indices]

        # 从shuffle后的数据中获取batch，并且遍历这个batch总共epochs次
        for start in range(0, N, batch_size): # get the batch-size dataset
            end = start+batch_size
            dw_sum, db_sum = 0, 0
            batch_X = X_shuffled[start:end+1]
            batch_y = y_shuffled[start:end+1]
            B = len(batch_X)

            for i in range(B): # iterate all samples in this mini-batch
                dw, db = gradients(batch_X[i], batch_y[i], w, b) # 如果不设置i的话 会导致计算gradients的时候维度不一致
                dw_sum+=dw
                db_sum+=db
            
            # 每次得到一个batch就更新一次参数
            w-=lr*dw_sum/B
            b-=lr*db_sum/B
        
        mse = mse_loss(batch_X, batch_y, w, b)
        print(f"Epoch: {ep}, w: {w}, b: {b}, MSE Loss: {mse}")
    return w, b

mini_batch_sgd(X, y)

Epoch: 0, w: 2.15, b: 0.81, MSE Loss: 0.09009999999999999
Epoch: 1, w: 2.0858666666666665, b: 0.7967333333333334, MSE Loss: 0.008367435555555539
Epoch: 2, w: 2.0801156666666665, b: 0.8048980000000001, MSE Loss: 0.007218910123611115
Epoch: 3, w: 2.059086793333333, b: 0.8050358694444446, MSE Loss: 0.010087603500074453
Epoch: 4, w: 2.057397136425, b: 0.8138169858111114, MSE Loss: 0.009234921238588685
Epoch: 5, w: 2.066522397706287, b: 0.8250059910101021, MSE Loss: 0.006184965916349388
Epoch: 6, w: 2.0453586116254487, b: 0.8230091200398606, MSE Loss: 0.0008468930989094966
Epoch: 7, w: 2.0505090542867643, b: 0.8305960064820263, MSE Loss: 0.0028707456393066676
Epoch: 8, w: 2.0487696215070588, b: 0.8354454471818631, MSE Loss: 0.0027113811233716147
Epoch: 9, w: 2.0626535125200234, b: 0.8458942114260517, MSE Loss: 0.004596443788262421
Epoch: 10, w: 2.0574925155409476, b: 0.8483285558534149, MSE Loss: 0.0008893934019027877
Epoch: 11, w: 2.055903020409974, b: 0.8519132624035453, MSE Loss: 0.00085

(np.float64(2.018477211629038), np.float64(0.9525837735222223))

### 1.3 SGD

In [33]:
def sgd(X, y, lr=0.1, epochs=50):
    w, b = 0, 0
    N = len(X)

    for ep in range(epochs):
        for i in range(N):
            dw, db = gradients(X[i], y[i], w, b)
            # 每次得到一个样本就更新一次参数
            w-=lr*dw
            b-=lr*db
        mse = mse_loss(X, y, w, b)
            
        print(f"Epoch: {ep}, w: {w}, b: {b}, MSE: {mse}")
    return w, b

sgd(X, y)

Epoch: 0, w: 1.9993999999999998, b: 1.0030999999999999, MSE: 3.0099999999977384e-06
Epoch: 1, w: 1.9994429000000002, b: 1.00287835, MSE: 2.59495787250237e-06
Epoch: 2, w: 1.9994827326500002, b: 1.0026725479749998, MSE: 2.2371449701169786e-06
Epoch: 3, w: 1.9995197172655248, b: 1.0024814607947872, MSE: 1.928670083761853e-06
Epoch: 4, w: 1.99955405748104, b: 1.00230403634796, MSE: 1.6627301054205377e-06
Epoch: 5, w: 1.9995859423711462, b: 1.002139297749081, MSE: 1.4334599923297457e-06
Epoch: 6, w: 1.999615547491609, b: 1.0019863379600216, MSE: 1.2358034192714807e-06
Epoch: 7, w: 1.999643035845959, b: 1.0018443147958798, MSE: 1.0654012663448599e-06
Epoch: 8, w: 1.9996685587829728, b: 1.0017124462879745, MSE: 9.184954828815488e-07
Epoch: 9, w: 1.9996922568299904, b: 1.0015900063783845, MSE: 7.91846207362491e-07
Epoch: 10, w: 1.9997142604666456, b: 1.0014763209223299, MSE: 6.826603154812293e-07
Epoch: 11, w: 1.9997346908432805, b: 1.0013707639763834, MSE: 5.885298205654958e-07
Epoch: 12, w:

(np.float64(1.9999841705893395), np.float64(1.000081785288414))

## 2. Momentum

In [34]:
def mini_batch_sgd_momentum(X, y, momentum=0.5, lr=0.1, batch_size=2, epochs=50):
    N = len(X)
    w, b = 0, 0
    vw_sum, vb_sum = 0, 0
    for ep in range(epochs):
        indices = np.random.permutation(N)
        X_shuffled = X[indices]
        y_shuffled = y[indices]
        for start in range(0, N, batch_size):
            end = start+batch_size
            X_batch = X_shuffled[start:end+1]
            y_batch = y_shuffled[start:end+1]
            dw_sum, db_sum = 0, 0
            for i in range(len(X_batch)):
                dw, db = gradients(X_batch[i], y_batch[i], w, b)
                dw_sum+=dw
                db_sum+=db
            
            vw_sum = momentum*vw_sum+dw_sum
            vb_sum = momentum*vb_sum+db_sum

            # Attention: need to divide by the number of samples in the batch
            # otherwise the loss function can not converge
            w-=lr*vw_sum/len(X_batch)
            b-=lr*vb_sum/len(X_batch)

            mse = mse_loss(X_batch, y_batch, w, b)
            print(f"Epoch: {ep}, w: {w}, b: {b}, MSE: {mse}")
    return w, b

mini_batch_sgd_momentum(X, y)


Epoch: 0, w: 1.6333333333333335, b: 0.5666666666666668, MSE: 1.8703703703703694
Epoch: 0, w: 3.205, b: 1.1266666666666667, MSE: 10.217373611111114
Epoch: 1, w: 2.650777777777778, b: 0.9793333333333333, MSE: 3.599131810699589
Epoch: 1, w: 1.8172722222222224, b: 0.7082055555555555, MSE: 0.5687701148302455
Epoch: 2, w: 1.8036122222222222, b: 0.7018274074074072, MSE: 0.8130771516124836
Epoch: 2, w: 1.8871900555555554, b: 0.756319222222222, MSE: 0.17366437538215004
Epoch: 3, w: 2.0508751425925924, b: 0.8251735586419752, MSE: 0.007175418247083797
Epoch: 3, w: 2.1831666748456793, b: 0.8841219265740741, MSE: 0.0962778761832167
Epoch: 4, w: 2.126085397004115, b: 0.8726202990967078, MSE: 0.052558252158943275
Epoch: 4, w: 2.0331638557960905, b: 0.8452106993279835, MSE: 0.005441646024424877
Epoch: 5, w: 2.015092813159623, b: 0.843814863119856, MSE: 0.014987740295971672
Epoch: 5, w: 2.0252301519784894, b: 0.8553679370198501, MSE: 0.009504880766375842
Epoch: 6, w: 2.047609736899617, b: 0.86611312235

(np.float64(2.0019125425085567), np.float64(0.9926677908774677))

In [35]:
def batch_gd_momentum(X, y, momentum=0.5, lr=0.1, epochs=50):
    N = len(X)
    w, b = 0, 0
    vw_sum, vb_sum = 0, 0
    for ep in range(epochs):
        dw_sum, db_sum = 0, 0
        for i in range(N):
            dw, db = gradients(X[i], y[i], w, b)
            dw_sum+=dw
            db_sum+=db
        
        vw_sum = momentum*vw_sum+dw_sum
        vb_sum = momentum*vb_sum+db_sum

        w-=lr*vw_sum
        b-=lr*vb_sum

        mse = mse_loss(X, y, w, b)
        print(f"Epoch: {ep}, w: {w}, b: {b}, MSE: {mse}")
    return w, b

batch_gd_momentum(X, y)

Epoch: 0, w: 7.0, b: 2.4000000000000004, MSE: 224.45999999999998
Epoch: 1, w: -5.9, b: -1.959999999999999, MSE: 593.7566
Epoch: 2, w: 14.310000000000004, b: 4.944000000000002, MSE: 1394.829086000001
Epoch: 3, w: -16.45900000000001, b: -5.4916000000000045, MSE: 3196.7932000600035
Epoch: 4, w: 30.02510000000003, b: 10.346240000000009, MSE: 7287.545480332614
Epoch: 5, w: -40.154390000000056, b: -13.498436000000016, MSE: 16593.512746157092
Epoch: 6, w: 65.7174710000001, b: 22.532990400000042, MSE: 37773.178962111815
Epoch: 7, w: -94.03200190000018, b: -31.781963560000055, MSE: 85981.33548615978
Epoch: 8, w: 146.97123091000034, b: 50.20534678400011, MSE: 195712.89804198808
Epoch: 9, w: -216.64619219900047, b: -73.4543676676002, MSE: 445485.4530037249
Epoch: 10, w: 331.9380405111008, b: 113.1437143726402, MSE: 1014021.9288743057
Epoch: 11, w: -495.72767903979127, b: -168.3527708673965, MSE: 2308134.4875319297
Epoch: 12, w: 752.9752691715331, b: 256.367773899335, MSE: 5253816.025459669
Epoch:

(np.float64(-3048387648.577673), np.float64(-1036823540.9672189))

In [36]:
def sgd_momentum(X, y, momentum=0.5, lr=0.1, epochs=50):
    # momentum如果设置为0.9，会导致MSE特别大，这是因为step过大，导致直接跳过了最优点
    N = len(X)
    w, b = 0, 0
    vw, vb = 0, 0
    for ep in range(epochs):
        indices = np.random.permutation(N)
        for i in indices:
            dw, db = gradients(X[i], y[i], w, b)
            
            vw = momentum*vw+dw
            vb = momentum*vb+db
            w-= lr*vw
            b-= lr*vb
        
        mse = mse_loss(X, y, w, b)
        print(f"Eppch: {ep}, w: {w}, b: {b}, MSE: {mse}")
    return w, b

sgd_momentum(X, y)


Eppch: 0, w: 1.6581000000000001, b: 1.0594000000000001, MSE: 0.7787011349999992
Eppch: 1, w: 1.9471649699999998, b: 1.3185866400000001, MSE: 0.038271326739750235
Eppch: 2, w: 1.7968935278500002, b: 1.2709395121600002, MSE: 0.10765216956434041
Eppch: 3, w: 2.0125502485057605, b: 1.305682791299798, MSE: 0.1138052594022433
Eppch: 4, w: 2.011005956646003, b: 1.2499321273284774, MSE: 0.0771282621727434
Eppch: 5, w: 1.9120302037943013, b: 1.2053542194955826, MSE: 0.00988564910314747
Eppch: 6, w: 1.9271990823799097, b: 1.1855529931690987, MSE: 0.006637574472141715
Eppch: 7, w: 1.9830282690198713, b: 1.1757985680857959, MSE: 0.01814740391326025
Eppch: 8, w: 1.9082336022260342, b: 1.1433076645324654, MSE: 0.01794098417910113
Eppch: 9, w: 1.957576328770666, b: 1.1423927079526563, MSE: 0.003569835244209805
Eppch: 10, w: 1.9823899035896204, b: 1.1257928580435757, MSE: 0.007073587562219843
Eppch: 11, w: 1.997162826134735, b: 1.1117812645737888, MSE: 0.010969708363903211
Eppch: 12, w: 1.953791776691

(np.float64(1.9999540810681369), np.float64(1.000112612349333))

## 3. Adagrad

In [37]:
def sgd_adagrad(X, y, epochs = 50, lr = 0.1, eps = 1e-6):
    w, b = 0, 0
    N = len(X)
    Gw, Gb = 0, 0
    for ep in range(epochs):
        shuffle_N = np.random.permutation(N)
        for i in shuffle_N: # shuffle
            dw, db = gradients(X[i], y[i], w, b)

            Gw+=dw**2
            Gb+=db**2
            
            w-=lr*dw/(np.sqrt(Gw)+eps)
            b-=lr*db/(np.sqrt(Gb)+eps)

        mse = mse_loss(X, y, w, b)
        print(f"Epoch: {ep}, w: {w}, b: {b}, MSE: {mse}")
    return w, b

sgd_adagrad(X, y)

Epoch: 0, w: 0.17612191929937315, b: 0.21913829041160254, MSE: 32.67971218983632
Epoch: 1, w: 0.2932076224050533, b: 0.3592128697536784, MSE: 27.72761274573771
Epoch: 2, w: 0.380477528327587, b: 0.4608362078536659, MSE: 24.328034753116537
Epoch: 3, w: 0.4548081381101954, b: 0.5467092331093564, MSE: 21.61471271507297
Epoch: 4, w: 0.5189618964665872, b: 0.6204043554182196, MSE: 19.406124902074033
Epoch: 5, w: 0.5751963624271446, b: 0.6853312626588424, MSE: 17.56621276453688
Epoch: 6, w: 0.6261815395170994, b: 0.7441051554360575, MSE: 15.97857619622319
Epoch: 7, w: 0.672719479215069, b: 0.7974801192530284, MSE: 14.597569622446862
Epoch: 8, w: 0.715523824262636, b: 0.846243887779375, MSE: 13.385214102428796
Epoch: 9, w: 0.7552557468401044, b: 0.8916372852829402, MSE: 12.306573728254575
Epoch: 10, w: 0.7923696686944118, b: 0.93386039023317, MSE: 11.341518070425138
Epoch: 11, w: 0.8270605158012813, b: 0.9731579353520108, MSE: 10.476543835692445
Epoch: 12, w: 0.8597742532364258, b: 1.01041793

(np.float64(1.442959817504202), np.float64(1.6483662055692223))

## 4. RMSProp

In [38]:
def sgd_rmsprop(X, y, lr=0.1, epochs=50, eps=1e-6, weight_decay=0.9):
    w, b = 0, 0
    sw, sb = 0, 0
    N = len(X)

    for ep in range(epochs):
        shuffle_N = np.random.permutation(N)
        for i in shuffle_N:
            dw, db = gradients(X[i], y[i], w, b)

            sw = weight_decay*sw+(1-weight_decay)*dw**2
            sb = weight_decay*sb+(1-weight_decay)*db**2

            w-=lr*dw/(np.sqrt(sw)+eps)
            b-=lr*db/(np.sqrt(sb)+eps)

        mse = mse_loss(X, y, w, b)
        print(f"Epoch: {ep}, w: {w}, b: {b}, MSE: {mse}")
    return w, b

sgd_rmsprop(X, y)


Epoch: 0, w: 0.6576958526835502, b: 0.7176228882684322, MSE: 15.488269853424542
Epoch: 1, w: 0.9939705407528008, b: 1.1145014351614908, MSE: 7.027866040710419
Epoch: 2, w: 1.2269951069483698, b: 1.3749916190129012, MSE: 3.172791167691508
Epoch: 3, w: 1.4123721579835218, b: 1.576406814376032, MSE: 1.2284799587292965
Epoch: 4, w: 1.5382076067527333, b: 1.6989591705362772, MSE: 0.47406538981418817
Epoch: 5, w: 1.6313944197735357, b: 1.7793466349812035, MSE: 0.190049137838243
Epoch: 6, w: 1.6730365664140057, b: 1.7876331948506925, MSE: 0.13451793242483595
Epoch: 7, w: 1.701553749263958, b: 1.7741264075251284, MSE: 0.11212230955758479
Epoch: 8, w: 1.731838141704961, b: 1.7522477195817625, MSE: 0.09658676654157208
Epoch: 9, w: 1.7768712743540604, b: 1.7431262316351708, MSE: 0.096570762413022
Epoch: 10, w: 1.8050243465620182, b: 1.7043254440854538, MSE: 0.09455905347014128
Epoch: 11, w: 1.8335080848024985, b: 1.6618650686000553, MSE: 0.09498613836120144
Epoch: 12, w: 1.8636421115695603, b: 1.

(np.float64(2.003441812465167), np.float64(1.012843811094733))

## 5. Adam

In [39]:
def sgd_adam(X, y, beta1=0.9, beta2=0.999, lr=0.1, epochs=50, eps=1e-8):
    w, b = 0, 0
    N = len(X)
    mw, mb = 0, 0
    vw, vb = 0, 0
    t = 0 # time step 
    # we have to record each update
    for ep in range(epochs):
        N_shuffle = np.random.permutation(N)
        for i in N_shuffle:
            t+=1
            dw, db = gradients(X[i], y[i], w, b)

            # 一阶矩
            mw = beta1*mw+(1-beta1)*dw
            mb = beta1*mb+(1-beta1)*db

            # 二阶矩
            vw = beta2*vw+(1-beta2)*(dw**2)
            vb = beta2*vb+(1-beta2)*(db**2)

            # debias
            mw_hat = mw/(1-beta1**t)
            mb_hat = mb/(1-beta1**t)

            vw_hat = vw/(1-beta2**t)
            vb_hat = vb/(1-beta2**t)

            # update
            w-=lr*mw_hat/(np.sqrt(vw_hat)+eps)
            b-=lr*mb_hat/(np.sqrt(vb_hat)+eps)

        mse = mse_loss(X, y, w, b)

        print(f"Epoch: {ep}, w: {w}, b: {b}, MSE: {mse}")
    return w, b

sgd_adam(X, y)
            

Epoch: 0, w: 0.3729277546647118, b: 0.3915724700246721, MSE: 25.175192482388802
Epoch: 1, w: 0.6721592269576225, b: 0.74521540859021, MSE: 14.9801904212575
Epoch: 2, w: 0.9861930229907389, b: 1.0957168229192693, MSE: 7.2325041954711065
Epoch: 3, w: 1.2634987784635405, b: 1.4065824470066721, MSE: 2.7363223117706763
Epoch: 4, w: 1.4971623686351943, b: 1.6688713285054662, MSE: 0.6620631079033326
Epoch: 5, w: 1.6844941461009324, b: 1.8747117030603442, MSE: 0.13181682835726838
Epoch: 6, w: 1.820525078551944, b: 2.016576868945853, MSE: 0.3627626176918099
Epoch: 7, w: 1.8960180863643858, b: 2.0865838028999253, MSE: 0.69683083269441
Epoch: 8, w: 1.9214663083166768, b: 2.0942907601966887, MSE: 0.8140353574564506
Epoch: 9, w: 1.9081616101400498, b: 2.0542855181849564, MSE: 0.6906557055311854
Epoch: 10, w: 1.866283465431308, b: 1.9851754739902996, MSE: 0.4460003000597063
Epoch: 11, w: 1.8122814147718507, b: 1.9041898068716916, MSE: 0.23318005453276405
Epoch: 12, w: 1.7613196222300402, b: 1.823850

(np.float64(1.9212268547893683), np.float64(1.217903125094386))