# 从疝气病症预测病马的死亡率

**说明:**

将 `horseColicTraining.txt` 和 `horseColicTest.txt` 放在当前目录下。

In [4]:
import numpy as np
import random

def get_data(filename):
    dataMatrix=[]
    dataLabel=[]
    with open(filename,'r+') as file:
        DataList=file.readlines()
        for data in DataList:
            data_tmp=data.strip().split()
            dataMatrix.append([1]+list(map(float,data_tmp[:-1])))
            dataLabel.append(float(data_tmp[-1]))
    dataMatrix=np.array(dataMatrix)
    dataLabel=np.array(dataLabel).reshape(len(dataLabel),1)
    
    return (dataMatrix,dataLabel)    

def sigmoid(x):
    
    result=1/(1+np.exp(-x))
    
    return result


def get_weight_gradAscent(dataMatrix,dataLabel,learning_rate=0.001,iters=200):
    
    (n_sample,n_features)=dataMatrix.shape
    weights=np.ones((n_features,1))
    for i in range(iters):
        z=np.dot(dataMatrix,weights)
        y_pre=sigmoid(z)
        error=dataLabel-y_pre
        weights=weights+learning_rate*np.dot(trainMatrix.T,error)/n_sample
   
    return weights

def get_weight_stocGradAscent(dataMatrix,dataLabel,learning_rate=4,iters=200):
    
    (n_sample,n_features)=dataMatrix.shape
    weights=np.ones((n_features,1))
    for i in range(iters):
        idx=list(range(n_sample))
        for j in range(n_sample):
            learning_rate=learning_rate/(1+i+j)+0.01
            randomIdx=int(random.uniform(0,len(idx)))
            trainSample=trainMatrix[randomIdx]
            trainSample=trainSample.reshape(1,len(trainSample))
            labelSample=trainLabel[randomIdx]
            labelSample=labelSample.reshape(1,1)
            z=np.dot(trainSample,weights)
            y_pre=sigmoid(z)
            error=labelSample-y_pre
            weights=weights+learning_rate*np.dot(trainSample.T,error)
            del(idx[randomIdx])
            
    return weights     

def predict(weights,dataMatrix,output_type='label'):
    
    z=np.dot(dataMatrix,weights)
    y_prob=sigmoid(z)
    y_label=(y_prob>=0.5).astype(float)
    
    if output_type=='label':
        return y_label
    else:
        return y_prob

### 读取数据

In [5]:
filename='data/horseColicTraining.txt'
(trainMatrix,trainLabel)=get_data(filename)

filename='data/horseColicTest.txt'
(testMatrix,testLabel)=get_data(filename)

print('训练集:',trainMatrix.shape)
print('训练集标签:',trainLabel.shape)

print('测试集:',testMatrix.shape)
print('测试集标签:',testLabel.shape)

训练集: (299, 22)
训练集标签: (299, 1)
测试集: (67, 22)
测试集标签: (67, 1)


### 梯度上升

In [185]:
learning_rate=[0.0001,0.001,0.01,0.1]
iters=[20,200,2000,20000]
for i in learning_rate:
    for j in iters:
        weights=get_weight_gradAscent(trainMatrix,trainLabel,learning_rate=i,iters=j)
        y_pre_label_train=predict(weights,trainMatrix)
        y_pre_label_test=predict(weights,testMatrix)
        train_error_rate=1-sum(trainLabel==y_pre_label_train)/y_pre_label_train.shape[0]
        test_error_rate=1-sum(testLabel==y_pre_label_test)/y_pre_label_test.shape[0]
        train_error_rate=round(train_error_rate[0],3)
        test_error_rate=round(test_error_rate[0],3)
        str_output="learning_rate={0}，iters={1}，训练集error_rate={2},测试集error_rate:{3}".\
        format(i,j,train_error_rate,test_error_rate)
        print(str_output)

learning_rate=0.0001，iters=20，训练集error_rate=0.405,测试集error_rate:0.299
learning_rate=0.0001，iters=200，训练集error_rate=0.405,测试集error_rate:0.299
learning_rate=0.0001，iters=2000，训练集error_rate=0.385,测试集error_rate:0.373
learning_rate=0.0001，iters=20000，训练集error_rate=0.425,测试集error_rate:0.388
learning_rate=0.001，iters=20，训练集error_rate=0.405,测试集error_rate:0.299
learning_rate=0.001，iters=200，训练集error_rate=0.385,测试集error_rate:0.373
learning_rate=0.001，iters=2000，训练集error_rate=0.425,测试集error_rate:0.388
learning_rate=0.001，iters=20000，训练集error_rate=0.274,测试集error_rate:0.239
learning_rate=0.01，iters=20，训练集error_rate=0.408,测试集error_rate:0.493
learning_rate=0.01，iters=200，训练集error_rate=0.468,测试集error_rate:0.493
learning_rate=0.01，iters=2000，训练集error_rate=0.411,测试集error_rate:0.507
learning_rate=0.01，iters=20000，训练集error_rate=0.351,测试集error_rate:0.448
learning_rate=0.1，iters=20，训练集error_rate=0.395,测试集error_rate:0.299
learning_rate=0.1，iters=200，训练集error_rate=0.361,测试集error_rate:0.313
learning_rate=0.1，i

### 随机梯度上升

In [6]:
learning_rate=[3,4,5]
iters=[20,200,2000,20000]
for i in learning_rate:
    for j in iters:
        weights=get_weight_stocGradAscent(trainMatrix,trainLabel,learning_rate=i,iters=j)
        y_pre_label_train=predict(weights,trainMatrix)
        y_pre_label_test=predict(weights,testMatrix)
        train_error_rate=1-sum(trainLabel==y_pre_label_train)/y_pre_label_train.shape[0]
        test_error_rate=1-sum(testLabel==y_pre_label_test)/y_pre_label_test.shape[0]
        train_error_rate=round(train_error_rate[0],3)
        test_error_rate=round(test_error_rate[0],3)
        str_output="learning_rate={0}，iters={1}，训练集error_rate={2},测试集error_rate:{3}".\
        format(i,j,train_error_rate,test_error_rate)
        print(str_output)



learning_rate=3，iters=20，训练集error_rate=0.338,测试集error_rate:0.403
learning_rate=3，iters=200，训练集error_rate=0.291,测试集error_rate:0.388
learning_rate=3，iters=2000，训练集error_rate=0.361,测试集error_rate:0.373
learning_rate=3，iters=20000，训练集error_rate=0.368,测试集error_rate:0.373
learning_rate=4，iters=20，训练集error_rate=0.294,测试集error_rate:0.299
learning_rate=4，iters=200，训练集error_rate=0.548,测试集error_rate:0.701
learning_rate=4，iters=2000，训练集error_rate=0.321,测试集error_rate:0.373
learning_rate=4，iters=20000，训练集error_rate=0.301,测试集error_rate:0.358
learning_rate=5，iters=20，训练集error_rate=0.415,测试集error_rate:0.463
learning_rate=5，iters=200，训练集error_rate=0.318,测试集error_rate:0.328
learning_rate=5，iters=2000，训练集error_rate=0.378,测试集error_rate:0.418
learning_rate=5，iters=20000，训练集error_rate=0.344,测试集error_rate:0.388
