In [2]:
import os
os.sys.path.append(os.path.dirname(os.path.abspath('.')))

## 数据准备

In [1]:
import numpy as np
from datasets.dataset import load_boston
from model_selection.train_test_split import train_test_split

data=load_boston()
X=data.data
Y=data.target

X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2)

# 为便于numpy矩阵乘法之间的维度维护，统一将Y转换成列向量，与一行一个样本对应
Y_train=Y_train.reshape((-1,1))
Y_test=Y_test.reshape((-1,1))

# print(X_train.shape,Y_train.shape)

数据$X$是一个$(n{\times}m)$的矩阵，每一行是一个样本，每一列代表一个特征：

In [3]:
n_sample=X_train.shape[0]
n_feature=X_train.shape[1]

## 粗略模型

模型表达式为：
$$
\hat{y}=wx+b
$$
在向量化操作时，其中的$y$、$w$、$x$都会被矩阵形式替代。为了便于与后期深度学习的一致性，设$X$的维度为$(n\_sample,n\_feature)$，设$W$的维度为$(n\_feature,n\_output)$，偏置系数$b$为单变量系数。模型可以写成：
$$
\hat{Y}=XW+b
$$

In [4]:
W = np.random.randn(n_feature).reshape((n_feature, 1))  # 权重，列向量
b = 1  # 偏置

Y_hat=np.dot(X_train, W)+b

# print(W.shape,Y_hat.shape)

(13, 1) (404, 1)


模型的损失函数为：
$$
\begin{align}
L&=\sum\limits_{i=1}^n(y^{(i)}-\hat{y}^{(i)})^{2} \\
&=\frac{1}{n}(Y-\hat{Y})^{T}(Y-\hat{Y}) \\
\end{align}
$$
损失函数关于参数$W$与$b$的梯度可以求得：
$$
\begin{align}
\frac{\partial{L}}{\partial{W}}&=\frac{2}{n}[X^{T}(\hat{Y}-Y)] \\
\frac{\partial{L}}{\partial{b}}&=\frac{2}{n}[1,1,...,1]{\cdot}(\hat{Y}-Y) \\
\end{align}
$$

In [6]:
dW = 2 * X_train.T.dot(Y_hat - Y_train) / n_sample
db = 2 * np.sum(Y_hat - Y_train)/ n_sample

# print((Y_hat - Y_train).shape,dW.shape,db.shape)

参数的迭代更新公式：
$$
W:=W-{\alpha}\frac{\partial{L}}{\partial{W}}, \quad b:b-{\alpha}\frac{\partial{L}}{\partial{b}}
$$

In [8]:
max_iter=2000
alpha=0.000001        # 注意学习率过大会导致震荡，然后误差越来越大

for i in range(max_iter):
    Y_hat=np.dot(X_train, W)+b
    
    dW = 2 * X_train.T.dot(Y_hat - Y_train) / n_sample
    db = 2 * np.sum(Y_hat - Y_train)/ n_sample
    
    W = W - alpha * dW
    b = b - alpha * db

使用该模型分别对训练集与预测集做预测：

In [9]:
Y_pred_train=np.dot(X_train, W) + b
Y_pred_test=np.dot(X_test, W) + b

定义一个RMSE损失函数来评价模型的表现：

In [10]:
def RMSE(Y_true,Y_pred):
    return np.sum((Y_true-Y_pred)**2)**0.5/len(Y_true)

print(RMSE(Y_train,Y_pred_train),RMSE(Y_test,Y_pred_test))

0.6455291230381478 1.320817214607008


模型简单打包：

In [11]:
def linear_reg(X,Y,alpha=0.000001,max_iter=2000):
    Y=Y.reshape((-1,1))
    
    n_sample=X.shape[0]
    n_feature=X.shape[1]
    
    W = np.random.randn(n_feature).reshape((n_feature,1))  # 权重
    b = 1  # 偏置
    
    for i in range(max_iter):
        Y_hat=np.dot(X, W)+b

        dW = 2 * X.T.dot(Y_hat - Y) / n_sample
        db = 2 * np.sum(Y_hat - Y) / n_sample

        W = W - alpha * dW
        b = b - alpha * db
        
        if i%200==0:
            Y_hat=np.dot(X, W)+b
            L=np.sum((Y-Y_hat)**2)**0.5/n_sample
            print(L,end='\t')

    return W,b

W,b=linear_reg(X_train,Y_train)

6.303090299154698	1.4524913644696893	1.3280720170673472	1.2393631194277546	1.173933489379412	1.123864363168316	1.0841649347118718	1.0516728174730992	1.0243467351198545	1.0008322128119318	

## 数据归一化
**Normalization：**
$$
x=\frac{x-x_{min}}{x_{max}-x_{min}}
$$

In [12]:
X=np.row_stack((X_train,X_test))

X_max=X.max(axis=0)
X_min=X.min(axis=0)

X_train_norm=(X_train-X_min)/(X_max-X_min)
X_test_norm=(X_test-X_min)/(X_max-X_min)

对数据归一化之后再测试模型表现：

In [13]:
W,b=linear_reg(X_train_norm,Y_train)

1.1044838755124307	1.1032449829611544	1.1020083872051087	1.100774084824075	1.0995420724019218	1.098312346526596	1.0970849037901154	1.0958597407885595	1.094636854122063	1.0934162403948062	

因为数据做了归一化，整个数据集上的梯度分布得到了改良，所以可以调大学习率，由此可以看出数据标准化在线性回归上的威力：

In [14]:
W,b=linear_reg(X_train_norm,Y_train,alpha=0.1)

0.6191387519019058	0.2560444784882639	0.24100717213845588	0.23728081441014445	0.23548376198145338	0.23432509017368958	0.2335130137195818	0.23292723815477373	0.23249816815292773	0.23218043458810245	

**Standardization：**
$$
x=\frac{x-x_{\mu}}{\sigma}
$$

In [15]:
X=np.row_stack((X_train,X_test))

X_avg=X.mean(axis=0)
X_std=X.std(axis=0)

X_train_std=(X_train-X_avg)/X_std
X_test_std=(X_test-X_avg)/X_std

In [16]:
W,b=linear_reg(X_train_std,Y_train)

1.1563238469968367	1.1557931272432078	1.1552629982206257	1.1547334581078446	1.154204505091039	1.1536761373637747	1.153148353126979	1.1526211505889112	1.152094527965133	1.1515684834784794	

In [17]:
W,b=linear_reg(X_train_std,Y_train,alpha=0.1)

0.9116086011026885	0.2312209165982425	0.23117815961078378	0.2311774977465675	0.23117748749898212	0.23117748734031987	0.2311774873378633	0.23117748733782525	0.2311774873378247	0.2311774873378247	

这个简单实验比较发现，相比于Normalization，Standardization能够更快地加速模型的收敛，这跟最小二乘法对于数据先验分布为正态分布的假设是一致的。

数据归一化工具简单打包：

In [18]:
def Standardization(X_train,X_test):
    X=np.row_stack((X_train,X_test))

    X_avg=X.mean(axis=0)
    X_std=X.std(axis=0)

    X_train_std=(X_train-X_avg)/X_std
    X_test_std=(X_test-X_avg)/X_std
    return X_train_std,X_test_std

## mini-batch梯度下降

In [19]:
def linear_reg(X,Y,alpha=0.000001,max_iter=2000,batch_size=32):
    Y=Y.reshape((-1,1))
    
    n=X.shape[0]
    m=X.shape[1]
    num_batch = n // batch_size
    
    W = np.random.rand(m).reshape((m, 1))  # 权重
    b = 1  # 偏置

    for epoch in range(max_iter):
        
        ######  mini-batch  ######
        for i in range(num_batch + 1):    # 有可能有多余的不完整batch，多循环一次
            start_index = i * batch_size
            end_index = (i + 1) * batch_size
            
            if start_index < n:
                # 切片操作不会引发越界
                X_batch = X[start_index:end_index + 1]
                Y_batch = Y[start_index:end_index + 1]
                
            n_batch=X_batch.shape[0]
            Y_hat_batch=np.dot(X_batch, W)+b

            dW = 2 * X_batch.T.dot(Y_hat_batch - Y_batch) / n_batch
            db = 2 * np.sum(Y_hat_batch - Y_batch) / n_batch
        
        
            W -= alpha * dW
            b -= alpha * db
        ######  mini-batch  ######
        
        if epoch%200==0:
            Y_hat=np.dot(X, W)+b
            L=np.sum((Y-Y_hat)**2)**0.5/n_sample
            print(L,end='\t')

    return W,b

W,b=linear_reg(X_train,Y_train)

6.100956335262958	1.1316041413155549	1.0006288374056365	0.8978036065957842	0.8163045383724775	0.7515903529855809	0.7004122054230549	0.6602646766232462	0.6290989351918346	0.6051841017882558	

单纯的mini-batch并没有很明显的提升模型表现，我们再加上Standardization：

In [20]:
X_train_std,X_test_std=Standardization(X_train,X_test)
W,b=linear_reg(X_train_std,Y_train,batch_size=32)

1.1983956793352732	1.1980861532565288	1.1977766928764577	1.1974672979226524	1.1971579681282514	1.1968487032318884	1.1965395029776387	1.1962303671149677	1.1959212953986793	1.1956122875888648	

额，表现并不是很理想，甚至在加大max_iter值后模型还是没有收敛，可能是数据量太小，mini-batch引入的随机性对模型的收敛起了一个反作用。