In [37]:
import numpy as np
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

In [38]:
data=load_boston()
X=data.data
Y=data.target

X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2)

数据$X$是一个$(n{\times}m)$的矩阵，每一行是一个样本，每一列代表一个特征；标签$Y$是一个列向量，其行数与$X$相同。模型表达式为：
$$
\hat{Y}=XW^{T}+b
$$
用代码描述即：
```
Y_hat=np.dot(X,W.T)+b
```

模型的损失函数为：
$$
\begin{align}
L&=\sum\limits_{i=1}^n(y^{(i)}-\hat{y}^{(i)})^{2} \\
&=\frac{1}{n}(Y-\hat{Y})^{T}(Y-\hat{Y}) \\
\end{align}
$$
用代码描述为：
```
L=np.dot((Y-Y_hat).T,Y-Y_hat)/2/n
```

损失函数关于参数$W$与$b$的梯度可以求得：
$$
\begin{align}
\frac{\partial{L}}{\partial{W}}&=\frac{2}{n}(\hat{Y}-Y)^{T}{\cdot}X \\
\frac{\partial{L}}{\partial{b}}&=\frac{2}{n}(\hat{Y}-Y)^{T}{\cdot}[1,1,...,1]^{T} \\
\end{align}
$$
用代码表示为：
```
dW=2*(Y_hat-Y).T.dot(X)/n
db=2*(Y_hat-Y).T.dot(np.ones((n,1)))/n
```

参数更新：
$$
W:=W-{\alpha}\frac{\partial{L}}{\partial{W}}, \quad b:b-{\alpha}\frac{\partial{L}}{\partial{b}}
$$
代码：
```
W=W-lr*dW
b=b-lr*db
```

In [65]:
class LinearRegression:
    def __init__(self, lr=0.00001, batch_size=32, max_iter=1000):
        self.lr = lr
        self.batch_size = batch_size
        self.max_iter = max_iter
        self.W = None
        self.b = None

    def fit(self, X, Y):
        X = X.copy()
        Y = Y.copy()

        n = X.shape[0]  # 样本数
        m = X.shape[1]  # 特征数
        assert Y.shape[0] == n  # 数据与标签应该相等
        Y = Y.reshape((n, 1))  # 标签，列向量

        self.W = np.random.rand(m).reshape((1, -1))  # 权重，行向量
        self.b = np.ones((1, 1))  # 偏置

        assert Y.shape == (n, 1)

        num_batch = n // self.batch_size

        for epoch in range(self.max_iter):
            for i in range(num_batch + 1):
                start_index = i * self.batch_size
                end_index = (i + 1) * self.batch_size
                if end_index <= n:
                    X_batch = X[start_index:end_index + 1]
                    Y_batch = Y[start_index:end_index + 1]
                else:
                    X_batch = X[start_index:]
                    Y_batch = Y[start_index:]

                Y_hat = X_batch.dot(self.W.T) + self.b
                dW = 2 * (Y_hat - Y_batch).T.dot(X_batch) / n
                db = 2 * (Y_hat - Y_batch).T.dot(np.ones((X_batch.shape[0], 1))) / n
                assert (dW.shape == self.W.shape) & (db.shape == self.b.shape)

                self.W = self.W - self.lr * dW
                self.b = self.b - self.lr * db

    def predict(self, X):
        X = X.copy()
        return np.squeeze(np.dot(X, self.W.T) + self.b)        # 将矩阵压缩成向量，与原始输入Y保持一致

    
line_reg=LinearRegression()
line_reg.fit(X_train,Y_train)

def RMSE(Y_true,Y_pred):
    return sum((Y_true-Y_pred)**2)**0.5

Y_pred=line_reg.predict(X_test)
RMSE(Y_test,Y_pred)

89.27020521140128