In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
np.random.seed(666)
X = np.random.random(size=(1000, 10))

In [3]:
true_theta = np.arange(1,12,dtype=float)

In [33]:
X_b = np.hstack([np.ones((len(X),1)), X])
y = X_b.dot(true_theta) + np.random.normal(size=1000)

In [34]:
X_b.shape

(1000, 11)

In [36]:
X_b[0]

array([1.        , 0.70043712, 0.84418664, 0.67651434, 0.72785806,
       0.95145796, 0.0127032 , 0.4135877 , 0.04881279, 0.09992856,
       0.50806631])

In [7]:
def J(theta, X_b, y):
    try:
        return np.sum((y - (X_b.dot(theta)))**2) / len(X_b)
    except:
        return float("inf")

In [8]:
# 求导
def dJ_math(theta, X_b, y):
    return X_b.T.dot(X_b.dot(theta) - y ) * 2. / len(y)

In [16]:
def dJ_debug(theta, X_b, y, epsilon=0.01):
    res = np.empty(len(theta))
    for i in range(len(theta)):
        theta_1 = theta.copy()
        theta_1[i] += epsilon
        theta_2 = theta.copy()
        theta_2[i] -= epsilon
        res[i] = (J(theta_1, X_b, y) - J(theta_2, X_b, y)) / (2. * epsilon)
    return res

In [10]:
# eta为步长
def gradient_descent(dJ, X_b, y, initial_theta, eta, n_iters= 1e4, epsilon=1e-8):
    theta = initial_theta
    cur_iter = 0
    
    while cur_iter < n_iters:
        gradient = dJ(theta, X_b, y)
        last_theta = theta
        theta = theta - eta * gradient
        
        # 判断是否本次梯度下降的值足够小，也就是判断是否已经拟合
        if(abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):
            break
            
        cur_iter += 1
    return theta

In [17]:
X_b_1 = np.hstack([np.ones((len(X), 1)), X])
initial_theta = np.zeros(X_b_1.shape[1])
eta = 0.01

%time theta = gradient_descent(dJ_debug, X_b_1, y, initial_theta, eta)
theta

CPU times: user 3.76 s, sys: 0 ns, total: 3.76 s
Wall time: 3.76 s


array([ 1.1251597 ,  2.05312521,  2.91522497,  4.11895968,  5.05002117,
        5.90494046,  6.97383745,  8.00088367,  8.86213468,  9.98608331,
       10.90529198])

## 小批量梯度下降法的实现

In [68]:
import math

# 确定随机数种子
np.random.seed(0)

# 损失函数
def J(theta, X_b, y):
    try:
        return np.sum((y - (X_b.dot(theta)))**2) / len(X_b)
    except:
        return float("inf")

# 求导
def dJ_math(theta, X_b, y):
    return X_b.T.dot(X_b.dot(theta) - y ) * 2. / len(y)


# 小批量梯度下降法
def MBGD(X_b, y,  initial_theta, eta, k=10,epsilon=1e-8, n_iters=1e4):
    
    # 设置初始化的theta等于0
    theta = initial_theta
    
    # 将数据集索引打乱
    reset_index = np.random.permutation(len(X_b))
    
    cur_iter = 0
    length = len(X_b)
    
    while cur_iter < n_iters:
        for i in range(k):
            # 将乱序索引列表分割成等分
            one_list = reset_index[math.floor(i / k * length):math.floor((i + 1) / k * length)]
            
            new_X_b = X_b[one_list]
            new_y = y[one_list]
            
            gradient = dJ_math(theta, new_X_b, new_y)
            last_theta = theta 
            
            theta = theta - eta * gradient

            if J(last_theta, X_b, y) - J(theta, X_b, y) < epsilon:
                break
            break
            
        cur_iter += 1  

    return theta
    
    

In [70]:
X_b = np.hstack([np.ones((len(X), 1)), X])
initial_theta = np.zeros(X_b.shape[1])
eta = 0.01

theta = MBGD(X_b, y, initial_theta,eta,k=5)
theta

array([ 1.18830429,  2.39962112,  2.81408209,  3.64015308,  5.12514224,
        5.72406456,  7.09629007,  7.92383037,  8.89595404, 10.23240324,
       10.75768754])

In [18]:
%time gradient_descent(dJ_math, X_b_1, y, initial_theta, eta)

CPU times: user 2.03 s, sys: 20 ms, total: 2.05 s
Wall time: 520 ms


array([ 1.1251597 ,  2.05312521,  2.91522497,  4.11895968,  5.05002117,
        5.90494046,  6.97383745,  8.00088367,  8.86213468,  9.98608331,
       10.90529198])