# 批量梯度下降法 Batch Gradient Descent

###    $\nabla J(\theta)=\begin{pmatrix} 
    \frac{\partial J}{\partial \theta_0} 
    \\ \frac{\partial J}{\partial \theta_1}
    \\ \ldots
    \\ \frac{\partial J}{\partial \theta_n}
    \end{pmatrix} = \frac{2}{m} 
    \begin{pmatrix}
    \sum_{i=1}^{m}(X_b^{(i)}\theta - y^{(i)})
    \\ \sum_{i=1}^{m}(X_b^{(i)}\theta - y^{(i)}) X_1^{(i)}
    \\ \sum_{i=1}^{m}(X_b^{(i)}\theta - y^{(i)}) X_2^{(i)}
    \\ \ldots
    \\ \sum_{i=1}^{m}(X_b^{(i)}\theta - y^{(i)}) X_n^{(i)}
    \end{pmatrix} = \frac{2}{m}\cdot X_b^T\cdot(X_b\theta - y)$

# 随机梯度下降法 Stochastic Gradient Descent

### $\frac{2}{m} 
    \begin{pmatrix}
    \sum_{i=1}^{m}(X_b^{(i)}\theta - y^{(i)})
    \\ \sum_{i=1}^{m}(X_b^{(i)}\theta - y^{(i)}) X_1^{(i)}
    \\ \sum_{i=1}^{m}(X_b^{(i)}\theta - y^{(i)}) X_2^{(i)}
    \\ \ldots
    \\ \sum_{i=1}^{m}(X_b^{(i)}\theta - y^{(i)}) X_n^{(i)}
    \end{pmatrix}$ ----> $2\cdot\begin{pmatrix}
(X_b^{(i)}\theta - y^{(i)}) X_0^{(i)}\\ 
(X_b^{(i)}\theta - y^{(i)}) X_1^{(i)}\\ 
(X_b^{(i)}\theta - y^{(i)}) X_2^{(i)}\\ 
\ldots\\ 
(X_b^{(i)}\theta - y^{(i)}) X_n^{(i)}
\end{pmatrix}$ $=2\cdot (X_b^{(i)})^T\cdot (X_b^{(i)}\theta - y^{(i)})$ 
    


#### 学习率 $\eta = \frac{1}{i\_iters}$ 随着迭代次数的上升而下降(模拟退火的思想)
#### 实际中用 $\eta = \frac{a}{i\_iters\ +\ b}$, a和b就是随机梯度下降法中使用的超参数
#### 经验数 a=5 , b=50

# 批量梯度下降法

In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Number of synthetic samples.
m = 100000

# One normally distributed feature, reshaped into a single-column matrix.
x = np.random.normal(size=m)
X = x.reshape(-1,1)
# Targets follow y = 4x + 3 plus Gaussian noise with standard deviation 3.
y = 4.*x + 3. + np.random.normal(0, 3, size=m)

In [3]:
def J(theta, X_b, y):
    """Return the mean squared error of the linear model ``X_b.dot(theta)``.

    Parameters
    ----------
    theta : parameter vector that ``X_b`` is multiplied with.
    X_b : design matrix; only its ``dot`` method is used.
    y : target values; ``len(y)`` is the divisor of the squared-error sum.

    Returns
    -------
    ``sum((y - X_b.dot(theta)) ** 2) / len(y)``, or ``float('inf')`` when
    the computation raises an ordinary exception (for example mismatched
    shapes).
    """
    try:
        return np.sum((y - X_b.dot(theta))**2)/len(y)
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt and
        # SystemExit must propagate so a long optimisation can be stopped.
        return float('inf')

def dJ(theta, X_b, y):
    """Return the gradient of the mean squared error with respect to theta.

    Computes ``X_b.T.dot(X_b.dot(theta) - y) * 2 / len(y)``, the vectorised
    form of the batch gradient.
    """
    residual = X_b.dot(theta) - y
    projected = X_b.T.dot(residual)
    return projected * 2. / len(y)

def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):
    """Minimise ``J`` by batch gradient descent and return the final theta.

    Parameters
    ----------
    X_b : design matrix passed unchanged to ``J`` and ``dJ``.
    y : target values passed unchanged to ``J`` and ``dJ``.
    initial_theta : starting parameter vector; it is not modified in place.
    eta : fixed step size applied to every gradient.
    n_iters : upper bound on the number of update steps.
    epsilon : the loop stops once the absolute change in cost between two
        consecutive steps is below this value.

    Returns
    -------
    The parameter vector after the last update step.
    """
    theta = initial_theta
    # Carry the cost of the current theta forward so that each iteration
    # evaluates J once (for the new theta) instead of twice.
    cost = J(theta, X_b, y)
    i_iter = 0

    while i_iter < n_iters:
        gradient = dJ(theta, X_b, y)
        theta = theta - eta * gradient

        last_cost, cost = cost, J(theta, X_b, y)
        if abs(cost - last_cost) < epsilon:
            break

        i_iter += 1
    return theta

In [7]:
%%time
# Prepend a column of ones so that theta[0] acts as the intercept term.
X_b = np.hstack([np.ones((len(X), 1)), X])
# Start from the all-zero parameter vector, one entry per column of X_b.
initial_theta = np.zeros(X_b.shape[1])
eta = 0.01  # fixed step size for batch gradient descent
theta = gradient_descent(X_b, y, initial_theta, eta)

Wall time: 1.01 s


In [8]:
theta

array([3.00506718, 4.00885108])

# 随机梯度下降法

In [10]:
def dJ_sgd(theta, X_b_i, y_i):
    """Return the squared-error gradient estimated from a single sample.

    Computes ``X_b_i.T.dot(X_b_i.dot(theta) - y_i) * 2`` for the sample row
    ``X_b_i`` and its target ``y_i``.
    """
    error = X_b_i.dot(theta) - y_i
    return X_b_i.T.dot(error) * 2.

In [11]:
def sgd(X_b, y, initial_theta, n_iters, t0=5, t1=50):
    """Estimate theta by stochastic gradient descent with a decaying step.

    Each iteration draws one row index uniformly at random (with
    replacement, via ``np.random.randint``) and takes a step along the
    single-sample gradient returned by ``dJ_sgd``.

    Parameters
    ----------
    X_b : design matrix; rows are indexed one at a time.
    y : target values, indexed with the same row index as ``X_b``.
    initial_theta : starting parameter vector; it is not modified in place.
    n_iters : number of single-sample updates to perform.
    t0 : numerator ``a`` of the learning-rate schedule (default 5).
    t1 : offset ``b`` added to the iteration count in the denominator of
        the learning-rate schedule (default 50).

    Returns
    -------
    The parameter vector after ``n_iters`` updates.
    """

    def learning_rate(t):
        # eta = a / (t + b): the step shrinks as the iteration count grows.
        return t0 / (t + t1)

    theta = initial_theta
    for cur_iter in range(n_iters):
        rand_i = np.random.randint(len(X_b))
        gradient = dJ_sgd(theta, X_b[rand_i], y[rand_i])
        theta = theta - learning_rate(cur_iter) * gradient

    return theta

In [12]:
%%time
# Prepend a column of ones so that theta[0] acts as the intercept term.
X_b = np.hstack([np.ones((len(X), 1)), X])
# Start from the all-zero parameter vector, one entry per column of X_b.
initial_theta = np.zeros(X_b.shape[1])
# Perform only len(X_b)//3 single-sample updates.
theta = sgd(X_b, y, initial_theta, n_iters=len(X_b)//3)

Wall time: 708 ms


In [13]:
theta

array([2.99209387, 4.03048732])

# 使用自己封装的随机梯度下降法训练模型

In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Number of synthetic samples.
m = 100000

# One normally distributed feature, reshaped into a single-column matrix.
x = np.random.normal(size=m)
X = x.reshape(-1,1)
# Targets follow y = 4x + 3 plus Gaussian noise with standard deviation 3.
y = 4.*x + 3. + np.random.normal(0, 3, size=m)

In [3]:
from c8_LinearRegression import LinearRegression

In [4]:
lin_reg = LinearRegression()
lin_reg.fit_sgd(X, y, n_iters=2)

LinearRegression()

In [5]:
lin_reg.coef_

array([3.98346802])

In [6]:
lin_reg.interception_

3.000898613670543

# 将我们封装的随机梯度方法使用在真实数据上

In [7]:
from sklearn import datasets

In [8]:
boston = datasets.load_boston()

In [9]:
X = boston.data
y = boston.target

# Keep only the samples whose target is below 50.0. X is filtered first
# because the mask has to be computed from the still-unfiltered y.
X = X[y<50.0]
y = y[y<50.0]

In [10]:
from c2_model_selection import train_test_split

In [11]:
X_train, y_train, X_test, y_test = train_test_split(X, y, seed=666)

In [12]:
# 梯度下降法, 需要 归一化 处理数据
from sklearn.preprocessing import StandardScaler

In [13]:
standard_scaler = StandardScaler()

In [14]:
# Fit the scaler on the training split only, then apply that same
# transform to both the training and the test split.
standard_scaler.fit(X_train)
X_train_standard = standard_scaler.transform(X_train)
X_test_standard = standard_scaler.transform(X_test)

In [15]:
from c8_LinearRegression import LinearRegression

In [16]:
lin_Reg1 = LinearRegression()

In [18]:
%time lin_Reg1.fit_sgd(X_train_standard, y_train, n_iters=2)
lin_Reg1.score(X_test_standard, y_test)

Wall time: 18 ms


0.7923329555425147

In [19]:
# 增加迭代次数来提高训练精度
%time lin_Reg1.fit_sgd(X_train_standard, y_train, n_iters=50)
lin_Reg1.score(X_test_standard, y_test)

Wall time: 331 ms


0.8132440489440967

# scikit-learn中的SGD

In [20]:
from sklearn.linear_model import SGDRegressor
# 只能解决 线性 回归问题

In [21]:
sgd_reg = SGDRegressor()
%time sgd_reg.fit(X_train_standard, y_train)
sgd_reg.score(X_test_standard, y_test)

Wall time: 6 ms




0.8045807839066259

In [22]:
# ``n_iter`` was deprecated in scikit-learn 0.19 and later removed; it was
# renamed to ``max_iter``. ``tol=None`` disables early stopping so that
# exactly 100 passes over the training data are made, as ``n_iter=100`` did.
sgd_reg = SGDRegressor(max_iter=100, tol=None)
%time sgd_reg.fit(X_train_standard, y_train)
sgd_reg.score(X_test_standard, y_test)

Wall time: 8.99 ms


0.8130817662958829