# 1：方差、偏差与训练集、测试集、验证集
### **by:MLZZY**
**实验描述：先对一个水库的流出水量以及水库水位进行正则化线性回归，再进一步研究方差/偏差的问题。**

**1：正则化线性回归**
**对水库的流出水量以及水库水位进行正则化线性回归**

In [2]:
import numpy as np
import scipy.io as sio
import scipy.optimize as opt
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
data=sio.loadmat('/home/mw/input/andrew_ml_ex55139/ex5data1.mat')
data

{'__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Fri Nov  4 22:27:26 2011',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[-15.93675813],
        [-29.15297922],
        [ 36.18954863],
        [ 37.49218733],
        [-48.05882945],
        [ -8.94145794],
        [ 15.30779289],
        [-34.70626581],
        [  1.38915437],
        [-44.38375985],
        [  7.01350208],
        [ 22.76274892]]),
 'y': array([[ 2.13431051],
        [ 1.17325668],
        [34.35910918],
        [36.83795516],
        [ 2.80896507],
        [ 2.12107248],
        [14.71026831],
        [ 2.61418439],
        [ 3.74017167],
        [ 3.73169131],
        [ 7.62765885],
        [22.7524283 ]]),
 'Xtest': array([[-33.31800399],
        [-37.91216403],
        [-51.20693795],
        [ -6.13259585],
        [ 21.26118327],
        [-40.31952949],
        [-14.54153167],
        [ 32.55976024],
        [ 13.39343255],
        [ 44.20988595],
        [ -1.14267768],
        [-

In [4]:
X,y,Xval,yval,Xtest,ytest=map(np.ravel,[data['X'],data['y'],data['Xval'],data['yval'],data['Xtest'],data['ytest']])
X.shape,y.shape,Xval.shape, yval.shape, Xtest.shape, ytest.shape

((12,), (12,), (21,), (21,), (21,), (21,))

In [5]:
fig,ax=plt.subplots(figsize=(12,8))
ax.scatter(X,y)
ax.set_xlabel('water_level')
ax.set_ylabel('flow')
plt.show()

In [6]:
X, Xval, Xtest = [np.insert(x.reshape(x.shape[0], 1), 0, np.ones(x.shape[0]), axis=1) for x in (X, Xval, Xtest)]

In [7]:
def cost(theta,X,y):
    # X: R(m*n), m个数据, n特征
    # y: R(m)
    # theta : R(n), 线性回归参数
    m=X.shape[0]
    inner=X@theta-y #R(m*1)
    square_sum=inner.T@inner
    cost=square_sum/(2*m)
    return cost

In [11]:
def costReg(theta,X,y,lamda=1):
    m=X.shape[0]
    regularized_term=(lamda/(2*m))*np.power(theta[1:],2).sum()
    return cost(theta,X,y)+regularized_term

In [12]:
theta=np.ones(X.shape[1])
costReg(theta,X,y,1)

303.9931922202643

In [13]:
def gradient(theta,X,y):
    m=X.shape[0]
    inner=X.T@(X@theta-y) # (n*1)
    return inner/m

In [14]:
def gradientReg(theta,X,y,lamda):
    m=X.shape[0]
    regularized_term=theta.copy()
    regularized_term[0]=0 #theta0不参与正则化项
    regularized_term=(lamda/m)*regularized_term
    return gradient(theta,X,y)+regularized_term

In [15]:
gradientReg(theta,X,y,1)

array([-15.30301567, 598.25074417])

In [16]:
#调用工具库函数，求theta的最优解
theta=np.ones(X.shape[1])
#令lamda=0，不使用正则化，因为theta是二维的，使用正则化对低维theta不会有很大的帮助
final_theta=opt.minimize(fun=costReg,x0=theta,args=(X,y,0),method='TNC',jac=gradientReg,options={'disp':True}).x
final_theta

array([13.08790362,  0.36777923])

In [17]:
b=final_theta[0]
m=final_theta[1]
fig,ax=plt.subplots(figsize=(12,8))
plt.scatter(X[:,1],y,c='r',label="Training data")
plt.plot(X[:,1],X[:,1]*m+b,c='b',label="Prediction")
ax.set_xlabel('water_level')
ax.set_ylabel('flow')
ax.legend()
plt.show()

In [18]:
def linear_regression(X,y,lamda=1):
    """
        X: feature matrix, (m, n+1) # with incercept x0=1
        y: target vector, (m, )
        l: lambda constant for regularization
    """
    theta=np.ones(X.shape[1])
    res=opt.minimize(fun=costReg,x0=theta,args=(X,y,lamda),method='TNC',jac=gradientReg,options={'disp':True})
    return res

In [19]:
training_cost,cv_cost=[],[]

In [20]:
m=X.shape[0]
for i in range(1,m+1):
    res=linear_regression(X[:i,:],y[:i],0)
    tc=costReg(res.x,X[:i,:],y[:i],0)
    cv=costReg(res.x,Xval,yval,0)
    training_cost.append(tc)
    cv_cost.append(cv)

In [21]:
fig,ax=plt.subplots(figsize=(12,8))
# x轴为数据的条数
plt.plot(np.arange(1,m+1),training_cost,label='training cost')
plt.plot(np.arange(1,m+1),cv_cost,label='cv cost')
plt.legend()
plt.show()

In [22]:
#由上图可知，该模型欠拟合
#线性回归对现有数据来说过于简单，会出现欠拟合现象，所以下面考虑多添加一些特征，使用多项式回归
#实现函数：返回X的1到p次幂
def ploy_features(x,power,as_ndarry=False):
    data={'f{}'.format(i):np.power(x,i) for i in range(1,power+1)}
    df=pd.DataFrame(data)
    return df.values if as_ndarry else df

In [23]:
data=sio.loadmat('/home/mw/input/andrew_ml_ex55139/ex5data1.mat')
X,y,Xval,yval,Xtest,ytest=map(np.ravel,[data['X'],data['y'],data['Xval'],data['yval'],data['Xtest'],data['ytest']])

In [24]:
ploy_features(X,power=3)

Unnamed: 0,f1,f2,f3
0,-15.936758,253.98026,-4047.621971
1,-29.152979,849.896197,-24777.006175
2,36.189549,1309.68343,47396.852168
3,37.492187,1405.664111,52701.422173
4,-48.058829,2309.651088,-110999.12775
5,-8.941458,79.94967,-714.866612
6,15.307793,234.328523,3587.0525
7,-34.706266,1204.524887,-41804.56089
8,1.389154,1.92975,2.68072
9,-44.38376,1969.918139,-87432.37359


In [25]:
def normalize_feature(df):
    return df.apply(lambda column:(column-column.mean())/column.std())

In [28]:
def prepare_poly_data(*args,power):
    def prepare(x):
        df=ploy_features(x,power=power)
        ndarr=normalize_feature(df).values
        return np.insert(ndarr,0,np.ones(ndarr.shape[0]),axis=1)
    return [prepare(x) for x in args]

In [30]:
X_poly, Xval_poly, Xtest_poly= prepare_poly_data(X, Xval, Xtest, power=8)
X_poly[:3, :]

array([[ 1.00000000e+00, -3.62140776e-01, -7.55086688e-01,
         1.82225876e-01, -7.06189908e-01,  3.06617917e-01,
        -5.90877673e-01,  3.44515797e-01, -5.08481165e-01],
       [ 1.00000000e+00, -8.03204845e-01,  1.25825266e-03,
        -2.47936991e-01, -3.27023420e-01,  9.33963187e-02,
        -4.35817606e-01,  2.55416116e-01, -4.48912493e-01],
       [ 1.00000000e+00,  1.37746700e+00,  5.84826715e-01,
         1.24976856e+00,  2.45311974e-01,  9.78359696e-01,
        -1.21556976e-02,  7.56568484e-01, -1.70352114e-01]])

In [36]:
def plot_learning_curve(X,Xinit,y,Xval,yval,lamda=0):
    training_cost,cv_cost=[],[]
    m=X.shape[0]
    for i in range(1,m+1):
        # 使用正则化拟合参数
        res=linear_regression(X[:i,:],y[:i],lamda=lamda)
        # 计算成本时，是在计算非正则化成本，正则化仅用于拟合参数
        tc = cost(res.x, X[:i, :], y[:i])
        cv = cost(res.x, Xval, yval)
        training_cost.append(tc)
        cv_cost.append(cv)
    fig, ax = plt.subplots(2,  1, figsize=(12, 12))
    ax[0].plot(np.arange(1, m + 1), training_cost, label='training cost')
    ax[0].plot(np.arange(1, m + 1), cv_cost, label='cv cost')
    ax[0].legend()
    fitx = np.linspace(-50, 50, 100) 
    fitxtmp = prepare_poly_data(fitx, power=8)
    fity = np.dot(prepare_poly_data(fitx, power=8)[0], linear_regression(X, y, lamda).x.T)
    ax[1].plot(fitx, fity, c='r', label='fitcurve')
    ax[1].scatter(Xinit, y, c='b', label='initial_Xy')
    ax[1].set_xlabel('water_level')
    ax[1].set_ylabel('flow')

In [37]:
plot_learning_curve(X_poly, X, y, Xval_poly, yval, lamda=0)
plt.show()

In [38]:
#由上图可知出现了过拟合现象
#下面调节正则化项系数lamda
plot_learning_curve(X_poly, X, y, Xval_poly, yval, lamda=1)
plt.show()

In [39]:
#由上图可知过拟合现象有极微小的改善，下面继续增大lamda的值
plot_learning_curve(X_poly, X, y, Xval_poly, yval, lamda=100)
plt.show()

In [40]:
#上图的lamda过大，出现了欠拟合现象
#下面尝试找出最佳的lamda
lamda_candidate = [0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10]
training_cost, cv_cost = [], []

In [41]:
for l in lamda_candidate:
    res = linear_regression(X_poly, y, l)
    tc = cost(res.x, X_poly, y)
    cv = cost(res.x, Xval_poly, yval)
    training_cost.append(tc)
    cv_cost.append(cv)

In [43]:
fig, ax = plt.subplots(figsize=(12,8))
ax.plot(lamda_candidate, training_cost, label='training')
ax.plot(lamda_candidate, cv_cost, label='cross validation')
plt.legend()
plt.xlabel('lamda')
plt.ylabel('cost')
plt.show()

In [44]:
#由上图可知最佳lamda的值约为1
#计算测试集的误差
#把最终的模型用在一个从来没有在计算中出现过的测试集上，也就是说，需要既没有被用作选择，也没有被用作选择的数据
for l in lamda_candidate:
    theta = linear_regression(X_poly, y, l).x
    print('test cost(l={}) = {}'.format(l, cost(theta, Xtest_poly, ytest)))

test cost(l=0) = 10.055426362410126
test cost(l=0.001) = 11.001927632262907
test cost(l=0.003) = 11.26474655167747
test cost(l=0.01) = 10.880780731411715
test cost(l=0.03) = 10.022100517865269
test cost(l=0.1) = 8.63190793331871
test cost(l=0.3) = 7.3366077892272585
test cost(l=1) = 7.466283751156784
test cost(l=3) = 11.643941860536106
test cost(l=10) = 27.715080254176254


In [None]:
#由以上结果可知lamda的最佳值为0.3，此时的测试代价最小