In [165]:
import numpy as np

def get_data(filename):
    """Load a whitespace-delimited numeric text file as a design matrix and labels.

    Every non-blank line is split on whitespace. The last field is taken as
    the target value and the remaining fields as features. A constant ``1``
    is prepended to each feature row so that the first column of the returned
    matrix can serve as the intercept term.

    Parameters
    ----------
    filename : str or path-like
        Path of the text file to read.

    Returns
    -------
    dataMatrix : numpy.ndarray of float
        One row per non-blank line: ``[1, feature_1, ..., feature_m]``.
    dataLabel : numpy.ndarray of float
        One entry per non-blank line (the last field of the line).

    Raises
    ------
    OSError
        If the file cannot be opened for reading.
    ValueError
        If a field cannot be converted to ``float``.
    """
    dataMatrix = []
    dataLabel = []
    # The file is only read, so open it read-only; 'r+' would needlessly
    # demand write permission on the data file.
    with open(filename, 'r') as file:
        for line in file:
            # str.split() with no argument already ignores surrounding whitespace.
            fields = line.split()
            if not fields:
                # Blank (or whitespace-only) lines carry no sample; skip them
                # instead of failing on fields[-1].
                continue
            dataMatrix.append([1] + fields[:-1])
            dataLabel.append(fields[-1])

    dataMatrix = np.array(dataMatrix, dtype=float)
    dataLabel = np.array(dataLabel, dtype=float)

    return dataMatrix, dataLabel

def model_linear_regression(dataMatrix,dataLabel):
    """Fit ordinary least squares by solving the normal equations.

    Computes ``W`` such that ``(X^T X) W = X^T Y`` where ``X`` is
    ``dataMatrix`` and ``Y`` is ``dataLabel`` reshaped to a column.

    Parameters
    ----------
    dataMatrix : numpy.ndarray, shape (n_samples, n_features)
        Design matrix.
    dataLabel : numpy.ndarray, shape (n_samples,)
        Target values.

    Returns
    -------
    numpy.ndarray of shape (n_features, 1), or None
        The coefficient column vector. ``None`` is returned (after printing a
        message) when ``X^T X`` is rank deficient and therefore not invertible.
    """
    Y = dataLabel.reshape((len(dataLabel),1))

    data_dot = np.dot(dataMatrix.T,dataMatrix)
    # A determinant compared to exactly 0 is scale dependent: it underflows to
    # 0 for small-valued but perfectly invertible matrices and may be a tiny
    # non-zero number for truly singular ones. The numerical rank uses a
    # tolerance relative to the largest singular value instead.
    if np.linalg.matrix_rank(data_dot) < data_dot.shape[0]:
        print('np.dot(dataMatrix.T,dataMatrix) 不可逆')
        return None

    # Solve the linear system directly rather than forming the explicit inverse.
    W = np.linalg.solve(data_dot,np.dot(dataMatrix.T,Y))
    return W

def model_ridge_regression(dataMatrix,dataLabel,lambd = 1):
    """Fit ridge regression in closed form.

    Computes ``W`` such that ``(X^T X + lambd * I) W = X^T Y`` where ``X`` is
    ``dataMatrix`` and ``Y`` is ``dataLabel`` reshaped to a column. The
    penalty ``lambd * I`` is applied to every column of ``X`` (the identity
    has one entry per column of ``dataMatrix``).

    Parameters
    ----------
    dataMatrix : numpy.ndarray, shape (n_samples, n_features)
        Design matrix.
    dataLabel : numpy.ndarray, shape (n_samples,)
        Target values.
    lambd : float, default 1
        Regularisation strength added to the diagonal of ``X^T X``.

    Returns
    -------
    numpy.ndarray of shape (n_features, 1), or None
        The coefficient column vector. ``None`` is returned (after printing a
        message) when the regularised matrix is rank deficient.
    """
    Y = dataLabel.reshape((len(dataLabel),1))
    I_matrix = np.identity(dataMatrix.shape[1])
    data_dot = np.dot(dataMatrix.T,dataMatrix) + lambd*I_matrix
    # Comparing a determinant to exactly 0 is scale dependent (it can
    # underflow for invertible matrices and be non-zero for singular ones);
    # the numerical rank is tested against a relative tolerance instead.
    if np.linalg.matrix_rank(data_dot) < data_dot.shape[0]:
        print('np.dot(dataMatrix.T,dataMatrix) 不可逆')
        return None

    # Solve the linear system directly rather than forming the explicit inverse.
    W = np.linalg.solve(data_dot,np.dot(dataMatrix.T,Y))
    return W

def predict(model,data):
    """Apply a fitted linear model to a design matrix.

    Parameters
    ----------
    model : numpy.ndarray
        Coefficient vector, e.g. the ``(n_features, 1)`` column returned by
        the regression functions in this notebook.
    data : numpy.ndarray, shape (n_samples, n_features)
        Design matrix to predict for.

    Returns
    -------
    numpy.ndarray
        ``np.dot(data, model)`` with all length-1 axes removed.
    """
    # Use the arguments that were passed in, not notebook-level globals, so
    # the function works for any model / data pair.
    y_pre = np.dot(data,model)
    y_pre = np.squeeze(y_pre)

    return y_pre


def model_lw_linear_regression(testPoint,dataMatrix,dataLabel,k):
    """Fit a locally weighted linear regression around one query point.

    Each training row ``x_i`` receives the Gaussian kernel weight
    ``exp(-||testPoint - x_i||^2 / (2 * k**2))`` and the weighted normal
    equations ``(X^T W X) w = X^T W Y`` are solved.

    Parameters
    ----------
    testPoint : numpy.ndarray, shape (n_features,)
        Query point the fit is centred on.
    dataMatrix : numpy.ndarray, shape (n_samples, n_features)
        Training design matrix.
    dataLabel : numpy.ndarray, shape (n_samples,)
        Training targets.
    k : float
        Kernel bandwidth; smaller values make the fit more local.

    Returns
    -------
    numpy.ndarray of shape (n_features, 1), or None
        Local coefficient column vector. ``None`` is returned (after printing
        a message) when the determinant of ``X^T W X`` is exactly zero.
    """
    Y = dataLabel.reshape((len(dataLabel),1))
    diffmat = testPoint- dataMatrix
    weights = np.exp(np.power(diffmat,2).sum(axis = 1)/(-2.0*k**2))

    # X^T W with W = diag(weights) is just X^T with column i scaled by
    # weights[i]. Broadcasting yields that product without allocating the
    # n_samples x n_samples diagonal matrix for every query point.
    weighted_T = dataMatrix.T * weights

    data_dot = np.dot(weighted_T,dataMatrix)
    if np.linalg.det(data_dot) == 0.0:
        print('矩阵不可逆')
        return

    data_inv = np.linalg.inv(data_dot)
    W = np.dot(data_inv,np.dot(weighted_T,Y))
    return W

def lw_predict(dataTest,dataMatrix,dataLabel,k=1):
    """Predict with locally weighted linear regression, one fit per test row.

    For every row of ``dataTest`` a separate local model is fitted with
    ``model_lw_linear_regression`` on ``dataMatrix`` / ``dataLabel`` and then
    evaluated at that row.

    Parameters
    ----------
    dataTest : numpy.ndarray, shape (n_test, n_features)
        Points to predict for.
    dataMatrix : numpy.ndarray, shape (n_samples, n_features)
        Training design matrix.
    dataLabel : numpy.ndarray, shape (n_samples,)
        Training targets.
    k : float, default 1
        Kernel bandwidth forwarded to the local fit.

    Returns
    -------
    numpy.ndarray
        Predictions with length-1 axes squeezed out. Rows whose local fit
        could not be computed (the fit returned ``None``) are ``NaN``.
    """
    y_pre = []
    for dataPoint in dataTest:
        W = model_lw_linear_regression(dataPoint,dataMatrix,dataLabel,k = k)
        if W is None:
            # The local fit reported a non-invertible system. Record a NaN
            # (same shape as a regular prediction) instead of crashing in
            # np.dot on None.
            y_pre.append(np.full(1,np.nan))
            continue
        y_pre.append(np.dot(dataPoint,W))

    y_pre = np.array(y_pre).squeeze()

    return y_pre


### 加载数据集

In [43]:

# Location of the data file, given relative to the current working directory.
filename = 'data/abalone.txt'

# Parse the file into the design matrix and the target vector.
dataMatrix, dataLabel = get_data(filename=filename)

# Show the dimensions of both arrays.
for caption, array in (('预测变量：', dataMatrix), ('目标变量:', dataLabel)):
    print(caption, array.shape)


预测变量： (4177, 9)
目标变量: (4177,)


### 标准线性回归

In [72]:

# Fit the plain linear model on the full data set and score it on the same data.
model_lr = model_linear_regression(dataMatrix, dataLabel)
y_pre = predict(model_lr, dataMatrix)

# Mean squared error between predictions and targets.
residual = y_pre - dataLabel
mse = np.power(residual, 2).mean()
print('训练集的均方误差为:', round(mse, 3))


训练集的均方误差为: 4.907


### 局部加权线性回归

In [167]:

# Evaluate the locally weighted model on the training data for several bandwidths.
for k in (1, 10, 100):
    y_pre = lw_predict(dataMatrix, dataMatrix, dataLabel, k=k)
    residual = y_pre - dataLabel
    mse = np.power(residual, 2).mean()
    message = 'k={0},训练集的均方误差为:{1}'.format(k, round(mse, 3))
    print(message)

    

k=1,训练集的均方误差为:4.664
k=10,训练集的均方误差为:4.904
k=100,训练集的均方误差为:4.906


### 岭回归

In [147]:

# Regularisation strengths to compare.
lambd_list = [0.01, 0.1, 1, 10, 100]

# Fit a ridge model for each strength and report its training error.
for lambd in lambd_list:
    model_lr = model_ridge_regression(dataMatrix, dataLabel, lambd=lambd)
    y_pre = predict(model_lr, dataMatrix)
    residual = y_pre - dataLabel
    mse = np.power(residual, 2).mean()
    message = 'lambd={0},训练集的均方误差为:{1}'.format(lambd, round(mse, 3))
    print(message)


lambd=0.01,训练集的均方误差为:4.907
lambd=0.1,训练集的均方误差为:4.907
lambd=1,训练集的均方误差为:4.935
lambd=10,训练集的均方误差为:5.336
lambd=100,训练集的均方误差为:6.709
