## Dataset Preparation

In [26]:
import numpy as np

X = np.array([1, 2, 3, 4]) # shape=(4, )
y = np.array([3, 5, 7, 9]) # y=2x+1


## 1. Gradient Descent

### 1.1 Batch Gradient Descent

In [27]:
def predict(x, w, b):
    return w*x+b

def gradients(x, y, w, b):
    y_pred = predict(x, w, b)
    dw = (y_pred-y)*x
    db = (y_pred-y)
    return dw, db

def mse_loss(x, y, w, b):
    prediction = predict(x, w, b)
    return ((prediction-y)**2).mean()

In [28]:
def batch_gd(X, y, lr=0.1, epochs=50):
    w, b = 0.0, 0.0
    N = len(X) # the number of dataset

    for ep in range(epochs):
        dw_sum, db_sum = 0.0, 0.0
        for i in range(N): # iterate all data points in the dataset
            dw, db = gradients(X[i], y[i], w, b)
            dw_sum+=dw
            db_sum+=db

        # average gradient
        w-=lr*dw_sum/N
        b-=lr*db_sum/N
        mse = mse_loss(X, y, w, b)
        print(f"Epoch {ep}, w: {w}, b: {b}, MSE Loss: {mse}")
    return w, b

batch_gd(X, y)

Epoch 0, w: 1.75, b: 0.6000000000000001, MSE Loss: 1.1287500000000006
Epoch 1, w: 2.0375, b: 0.7025000000000001, MSE Loss: 0.0432718749999998
Epoch 2, w: 2.0837499999999998, b: 0.722875, MSE Loss: 0.013357640625000014
Epoch 3, w: 2.09021875, b: 0.7296500000000001, MSE Loss: 0.01218159857421868
Epoch 4, w: 2.0901421875, b: 0.7341303125000002, MSE Loss: 0.011798419380029259
Epoch 5, w: 2.08900296875, b: 0.7381817343750001, MSE Loss: 0.011447252994228812
Epoch 6, w: 2.08770530859375, b: 0.7421128187500001, MSE Loss: 0.011107082849989253
Epoch 7, w: 2.0863981224609374, b: 0.7459752097265626, MSE Loss: 0.010777036118485434
Epoch 8, w: 2.085105728183594, b: 0.749778158138672, MSE Loss: 0.010456797121681672
Epoch 9, w: 2.0838318925112307, b: 0.7535239102789063, MSE Loss: 0.010146074018873748
Epoch 10, w: 2.0825769955580813, b: 0.757213546123208, MSE Loss: 0.009844584034832125
Epoch 11, w: 2.0813408623587186, b: 0.7608479426213669, MSE Loss: 0.00955205280768426
Epoch 12, w: 2.0801232299343377,

(np.float64(2.0458560156699397), np.float64(0.8651777219412482))

### 1.2 Mini-Batch SGD

In [29]:
def mini_batch_sgd(X, y, lr=0.1, batch_size =2, epochs=50):
    w = 0.0
    b = 0.0
    N = len(X)

    for ep in range(epochs):
        # shuffle
        indices = np.random.permutation(N)
        X_shuffled = X[indices]
        y_shuffled = y[indices]

        # 从shuffle后的数据中获取batch，并且遍历这个batch总共epochs次
        for start in range(0, N, batch_size): # get the batch-size dataset
            end = start+batch_size
            dw_sum, db_sum = 0, 0
            batch_X = X_shuffled[start:end+1]
            batch_y = y_shuffled[start:end+1]
            B = len(batch_X)

            for i in range(B): # iterate all samples in this mini-batch
                dw, db = gradients(batch_X[i], batch_y[i], w, b) # 如果不设置i的话 会导致计算gradients的时候维度不一致
                dw_sum+=dw
                db_sum+=db
            
            # 每次得到一个batch就更新一次参数
            w-=lr*dw_sum/B
            b-=lr*db_sum/B
        
        mse = mse_loss(batch_X, batch_y, w, b)
        print(f"Epoch: {ep}, w: {w}, b: {b}, MSE Loss: {mse}")
    return w, b

mini_batch_sgd(X, y)

Epoch: 0, w: 1.995, b: 0.7666666666666666, MSE Loss: 0.0604902777777779
Epoch: 1, w: 2.05885, b: 0.8009, MSE Loss: 0.010493876250000022
Epoch: 2, w: 2.058465, b: 0.8074165833333333, MSE Loss: 0.009845753010423591
Epoch: 3, w: 2.0679317543055555, b: 0.8151881047222221, MSE Loss: 0.0013781564020913693
Epoch: 4, w: 2.0446728078092593, b: 0.8124572222435186, MSE Loss: 0.001471603191045254
Epoch: 5, w: 2.070642496538982, b: 0.827365902870662, MSE Loss: 0.005692526457903304
Epoch: 6, w: 2.0508497560172603, b: 0.8257898866742223, MSE Loss: 0.0030548899049989272
Epoch: 7, w: 2.0510340471803175, b: 0.8342463789877562, MSE Loss: 0.007316901062058425
Epoch: 8, w: 2.048836201447731, b: 0.8381897440886142, MSE Loss: 0.0026191151125577296
Epoch: 9, w: 2.0475956652118925, b: 0.8432051807821385, MSE Loss: 0.006526299170480884
Epoch: 10, w: 2.05585770321142, b: 0.8502850673937674, MSE Loss: 0.0008814392384633646
Epoch: 11, w: 2.043774493996368, b: 0.8507563895406949, MSE Loss: 0.002237337329026614
Epoc

(np.float64(2.0168945694460385), np.float64(0.954208465744693))

### 1.3 SGD

In [30]:
def sgd(X, y, lr=0.1, epochs=50):
    w, b = 0, 0
    N = len(X)

    for ep in range(epochs):
        for i in range(N):
            dw, db = gradients(X[i], y[i], w, b)
            # 每次得到一个样本就更新一次参数
            w-=lr*dw
            b-=lr*db
        mse = mse_loss(X, y, w, b)
            
        print(f"Epoch: {ep}, w: {w}, b: {b}, MSE: {mse}")
    return w, b

sgd(X, y)

Epoch: 0, w: 1.9993999999999998, b: 1.0030999999999999, MSE: 3.0099999999977384e-06
Epoch: 1, w: 1.9994429000000002, b: 1.00287835, MSE: 2.59495787250237e-06
Epoch: 2, w: 1.9994827326500002, b: 1.0026725479749998, MSE: 2.2371449701169786e-06
Epoch: 3, w: 1.9995197172655248, b: 1.0024814607947872, MSE: 1.928670083761853e-06
Epoch: 4, w: 1.99955405748104, b: 1.00230403634796, MSE: 1.6627301054205377e-06
Epoch: 5, w: 1.9995859423711462, b: 1.002139297749081, MSE: 1.4334599923297457e-06
Epoch: 6, w: 1.999615547491609, b: 1.0019863379600216, MSE: 1.2358034192714807e-06
Epoch: 7, w: 1.999643035845959, b: 1.0018443147958798, MSE: 1.0654012663448599e-06
Epoch: 8, w: 1.9996685587829728, b: 1.0017124462879745, MSE: 9.184954828815488e-07
Epoch: 9, w: 1.9996922568299904, b: 1.0015900063783845, MSE: 7.91846207362491e-07
Epoch: 10, w: 1.9997142604666456, b: 1.0014763209223299, MSE: 6.826603154812293e-07
Epoch: 11, w: 1.9997346908432805, b: 1.0013707639763834, MSE: 5.885298205654958e-07
Epoch: 12, w:

(np.float64(1.9999841705893395), np.float64(1.000081785288414))

## 2. Momentum