### 封装我们自己的SGD

In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Seed NumPy's global generator so the synthetic data below is identical on
# every Restart & Run All (the original cell drew unseeded random numbers).
np.random.seed(666)

m = 100000  # number of synthetic samples

# One standard-normal feature; X holds the same values as an (m, 1) column matrix.
x = np.random.normal(size=m)
X = x.reshape(-1, 1)
# Targets follow y = 4x + 3 plus Gaussian noise with standard deviation 3.
y = 4.*x + 3. + np.random.normal(0, 3, size=m)

In [3]:
from playML.LinearRegression import LinearRegression

## sgd算法的更新和实现

def sgd(X_b, y, initial_theta, n_iters=5, t0=5, t1=50):
    """Run stochastic gradient descent and return the final parameter vector.

    Each of the ``n_iters`` passes visits all ``m = len(X_b)`` rows exactly
    once, in a freshly shuffled order, so ``n_iters=5`` means every sample is
    looked at five times.

    Parameters
    ----------
    X_b : 2-D array, one row per sample (indexed as ``X_b[indexes, :]``).
        Presumably the feature matrix with a leading column of ones -- confirm
        against the caller.
    y : array of targets, indexed with the same row indexes as ``X_b``.
    initial_theta : starting parameter vector. It is never modified in place;
        each update rebinds ``theta`` to a new value.
    n_iters : number of full passes over the data.
    t0, t1 : constants of the decaying learning rate ``t0 / (t + t1)``.

    Returns
    -------
    The parameter vector after ``n_iters * m`` single-sample updates
    (``initial_theta`` itself when no update is performed).

    Notes
    -----
    ``dJ_sgd(theta, X_b_i, y_i)`` is called for every sample but is not defined
    in this block; it must be available in the enclosing scope. Presumably it
    returns the gradient of the loss for a single sample -- confirm.
    """

    def learning_rate(t):
        # Step size shrinks as the global update counter t grows.
        return t0 / (t + t1)

    theta = initial_theta
    m = len(X_b)
    for i_iter in range(n_iters):
        # Shuffle the row indexes so every pass sees the samples in a new order.
        indexes = np.random.permutation(m)
        X_b_new = X_b[indexes,:]
        y_new = y[indexes]
        # Walk through all m shuffled samples so each one is used exactly once
        # per pass; over the whole run every sample is used n_iters times.
        for i in range(m):
            gradient = dJ_sgd(theta, X_b_new[i], y_new[i])
            # i_iter * m + i is the number of updates done so far across all
            # passes, which keeps the learning rate decaying between passes.
            theta = theta - learning_rate(i_iter * m + i) * gradient

    return theta

In [4]:
# Fit the project's LinearRegression on the synthetic data via fit_bgd and
# print the learned intercept and coefficients.
# NOTE(review): fit_bgd is presumably batch gradient descent -- confirm in playML.
lin_reg = LinearRegression()
lin_reg.fit_bgd(X, y)
print(lin_reg.intercept_, lin_reg.coef_)

3.0009417894233965 [4.00214259]


In [5]:
# Fit a fresh LinearRegression on the same synthetic data via fit_sgd with
# n_iters=2 and print the learned intercept and coefficients.
# NOTE(review): n_iters presumably counts full passes over the data, as in the
# sgd() sketch in this notebook -- confirm in playML.
lin_reg = LinearRegression()
lin_reg.fit_sgd(X, y, n_iters=2)
print(lin_reg.intercept_, lin_reg.coef_)

2.9889792122708623 [3.99078933]


### 真实使用我们自己的SGD

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets

# Download the whitespace-separated Boston housing table; the first 22 lines
# of the file are skipped and no header row is used.
data_url = "http://lib.stat.cmu.edu/datasets/boston"
# Raw string: "\s" is not a valid escape sequence in a normal string literal
# and triggers a warning on current Python versions.
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
# Rows are consumed in pairs: every even row supplies all of its columns and
# the following odd row supplies its first two columns as features; the third
# column of each odd row is the target.
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

# Keep only the samples whose target is below 50.0. The mask is computed once
# so the result does not depend on filtering X before y.
# NOTE(review): 50.0 is presumably a cap value in this dataset -- confirm.
keep = target < 50.0
X = data[keep]
y = target[keep]

In [7]:
from playML.model_selection import train_test_split

# Split features and targets into train and test parts with the project's
# train_test_split. seed=666 is passed, presumably to make the split
# reproducible -- confirm in playML.model_selection.
X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)

In [8]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply that
# same fitted scaler to both the training and the test split.
standardScaler = StandardScaler().fit(X_train)
X_train_standard, X_test_standard = (
    standardScaler.transform(split) for split in (X_train, X_test)
)

In [9]:
from playML.LinearRegression import LinearRegression

# Train a fresh model with fit_sgd (n_iters=2) on the standardized training
# data, timing the fit with %time. The last expression displays the model's
# score on the standardized test split.
# NOTE(review): score presumably returns R^2 -- confirm in playML.
lin_reg = LinearRegression()
%time lin_reg.fit_sgd(X_train_standard, y_train, n_iters=2)
lin_reg.score(X_test_standard, y_test)

CPU times: total: 15.6 ms
Wall time: 7.88 ms


0.7857275413602652

In [10]:
# Fit again with n_iters=50 and display the test score. This reuses the
# lin_reg object created in an earlier cell.
# NOTE(review): whether fit_sgd restarts from its initial parameters or
# continues from the previous fit is not visible here -- confirm in playML.
%time lin_reg.fit_sgd(X_train_standard, y_train, n_iters=50)
lin_reg.score(X_test_standard, y_test)

CPU times: total: 172 ms
Wall time: 185 ms


0.808560757055621

In [11]:
# Fit again with n_iters=100 and display the test score. This reuses the
# lin_reg object created in an earlier cell.
# NOTE(review): whether fit_sgd restarts from its initial parameters or
# continues from the previous fit is not visible here -- confirm in playML.
%time lin_reg.fit_sgd(X_train_standard, y_train, n_iters=100)
lin_reg.score(X_test_standard, y_test)

CPU times: total: 359 ms
Wall time: 360 ms


0.8129434245278827

### scikit-learn中的SGD

In [17]:
from sklearn.linear_model import SGDRegressor

# Baseline: scikit-learn's SGDRegressor with all default settings, fitted on
# the standardized training data and timed with %time. The last expression
# displays its score on the standardized test split.
# NOTE(review): no random_state is passed, so the displayed score may vary
# between runs.
sgd_reg = SGDRegressor()
%time sgd_reg.fit(X_train_standard, y_train)
sgd_reg.score(X_test_standard, y_test)

CPU times: total: 31.2 ms
Wall time: 2.99 ms


0.8128942688369426

In [22]:
# Same fit and score, but with n_iter_no_change=150.
# NOTE(review): in scikit-learn's SGDRegressor, n_iter_no_change is the number
# of iterations without improvement to wait before stopping, not the number of
# epochs; max_iter is left at its default here. If the intent was to mirror the
# n_iters experiments above, confirm whether max_iter was meant.
sgd_reg = SGDRegressor(n_iter_no_change=150)
%time sgd_reg.fit(X_train_standard, y_train)
sgd_reg.score(X_test_standard, y_test)

CPU times: total: 0 ns
Wall time: 14.2 ms


0.8131364346197649