In [1]:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
################################################################################
#
# Copyright (c) 2017 Baidu.com, Inc. All Rights Reserved
#
################################################################################
"""
Authors: yin xiaoting(y_tink@163.com)
Date:    2017/11/16

使用paddlepaddle来做线性回归，拟合房屋价格与房屋面积的线性关系，具体步骤如下：
1.载入数据和预处理：load_data()
2.定义两个reader()分别用于读取训练数据和测试数据
3.初始化
4.配置网络结构
5.定义成本函数cost
6.定义优化器optimizer
7.定义trainer并开始训练，获得训练结果参数a，b
"""
import numpy as np
import paddle.v2 as paddle
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

CODEMASTER_TRAIN_DATA = None
X_RAW = None
CODEMASTER_TEST_DATA = None



In [2]:

def load_data(filename, feature_num=2, ratio=0.8):
    """
    载入数据并进行数据预处理

    Args:
        filename -- 数据存储文件，从该文件读取数据
        feature_num -- 数据特征数量
        ratio -- 训练集占总数据集比例
    Return:
    """
    #如果测试数据集和训练数据集都不为空，就不再载入数据load_data
    global CODEMASTER_TRAIN_DATA, CODEMASTER_TEST_DATA, X_RAW
    if CODEMASTER_TRAIN_DATA is not None and CODEMASTER_TEST_DATA is not None:
        return
    #data = np.loadtxt()表示将数据载入后以矩阵或向量的形式存储在data中
    #delimiter=',' 表示以','为分隔符
    data = np.loadtxt(filename, delimiter=',')
    X_RAW = data.T[0].copy()
    #axis=0 表示按列计算
    #data.shape[0]表示data中一共多少列
    maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum(
        axis=0) / data.shape[0]
    #归一化，data[:, i] 表示第i列的元素
    for i in xrange(feature_num - 1):
        data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i])
    #offset用于划分训练数据集和测试数据集，例如0.8表示训练集占80%
    offset = int(data.shape[0] * ratio)
    CODEMASTER_TRAIN_DATA = data[:offset].copy()
    CODEMASTER_TEST_DATA = data[offset:].copy()


In [3]:


def train():
    """
    定义一个reader来获取训练数据集及其标签：x，y

    Args:
    Return:
        reader -- 用于获取训练数据集及其标签的reader
    """
    global CODEMASTER_TRAIN_DATA
    load_data("data.txt")

    #yield作用同return，但是返回的是生成器(generator)，生成器只能调用一次，实时计算
    def reader():
        """
            一个reader
            Args:
            Return:
                data[:-1], data[-1:] -- 使用yield返回生成器(generator)，
                        data[:-1]表示前n-1个元素，也就是训练数据，data[-1:]表示最后一个元素，也就是对应的标签
            """
        for d in CODEMASTER_TRAIN_DATA:
            yield d[:-1], d[-1:]

    return reader

for i in train()():
    print(i)
    
    
    
    

(array([ 0.0264306]), array([ 599.]))
(array([-0.16203503]), array([ 450.]))
(array([-0.03380584]), array([ 440.]))
(array([ 0.21608469]), array([ 780.]))
(array([-0.20644611]), array([ 450.]))
(array([-0.1291333]), array([ 315.]))
(array([ 0.18405867]), array([ 998.]))
(array([-0.18542904]), array([ 435.]))
(array([-0.23221705]), array([ 435.]))
(array([-0.21407731]), array([ 225.]))
(array([ 0.02586764]), array([ 685.]))
(array([-0.19606268]), array([ 320.]))
(array([ 0.00097241]), array([ 568.]))
(array([-0.27956803]), array([ 365.]))
(array([-0.06182861]), array([ 530.]))
(array([ 0.20751523]), array([ 720.]))
(array([-0.05913893]), array([ 709.]))
(array([-0.09216576]), array([ 510.]))
(array([-0.01929405]), array([ 600.]))
(array([-0.22295953]), array([ 300.]))
(array([ 0.00985463]), array([ 580.]))
(array([ 0.00028435]), array([ 380.]))
(array([-0.31053068]), array([ 210.]))
(array([-0.16666379]), array([ 320.]))
(array([-0.10805367]), array([ 630.]))
(array([ 0.05851917]), arra

(array([ 0.07728442]), array([ 940.]))
(array([ 0.09248427]), array([ 880.]))
(array([-0.01697967]), array([ 616.]))
(array([-0.17485795]), array([ 296.]))
(array([-0.23478164]), array([ 280.]))
(array([ 0.02586764]), array([ 590.]))
(array([-0.34124314]), array([ 275.]))
(array([ 0.31410183]), array([ 850.]))
(array([ 0.1022422]), array([ 600.]))
(array([ 0.05307725]), array([ 565.]))
(array([-0.04781723]), array([ 565.]))
(array([-0.24797986]), array([ 347.]))
(array([-0.25579871]), array([ 312.]))
(array([-0.20194245]), array([ 480.]))
(array([ 0.25492875]), array([ 710.]))
(array([-0.12775718]), array([ 450.]))
(array([ 0.01066779]), array([ 400.]))
(array([-0.03574492]), array([ 542.]))
(array([ 0.02098868]), array([ 598.]))
(array([-0.17742253]), array([ 530.]))
(array([ 0.15359642]), array([ 950.]))
(array([-0.17291887]), array([ 415.]))
(array([-0.16103422]), array([ 430.]))
(array([-0.18336486]), array([ 400.]))
(array([-0.24797986]), array([ 350.]))
(array([ 0.33374279]), arr

(array([ 0.65118821]), array([ 1550.]))
(array([-0.21094977]), array([ 300.]))
(array([-0.07002277]), array([ 540.]))
(array([-0.23303021]), array([ 248.]))
(array([-0.16366135]), array([ 450.]))
(array([ 0.2890815]), array([ 750.]))
(array([ 0.32123263]), array([ 1150.]))
(array([-0.21964434]), array([ 430.]))
(array([-0.10949234]), array([ 350.]))
(array([-0.24797986]), array([ 360.]))
(array([-0.09066454]), array([ 615.]))
(array([ 0.14984337]), array([ 660.]))
(array([-0.19606268]), array([ 330.]))
(array([-0.21964434]), array([ 350.]))
(array([-0.21889373]), array([ 325.]))
(array([ 0.13101557]), array([ 470.]))
(array([-0.14789854]), array([ 460.]))
(array([ 0.11481491]), array([ 500.]))
(array([ 0.00034691]), array([ 650.]))
(array([-0.09691962]), array([ 460.]))
(array([ 0.34650316]), array([ 975.]))
(array([ 0.36332933]), array([ 1250.]))
(array([-0.29802052]), array([ 230.]))
(array([ 0.32542353]), array([ 680.]))
(array([ 0.56330431]), array([ 980.]))
(array([-0.21632914]), 

In [4]:

def test():
    """
    定义一个reader来获取测试数据集及其标签：x，y

    Args:
    Return:
        reader -- 用于获取测试数据集及其标签的reader
    """
    global CODEMASTER_TEST_DATA
    load_data("data.txt")

    def reader():
        """
            一个reader
            Args:
            Return:
                data[:-1], data[-1:] -- 使用yield返回生成器(generator)，
                        data[:-1]表示前n-1个元素，也就是测试数据，data[-1:]表示最后一个元素，也就是对应的标签
            """
        for d in CODEMASTER_TEST_DATA:
            yield d[:-1], d[-1:]

    return reader


In [5]:


# 展示模型训练曲线
def plot_costs(costs):
    """
    利用costs展示模型的训练曲线

    Args:
        costs -- 记录了训练过程的cost变化的list，每一百次迭代记录一次
    Return:
    """
    costs = np.squeeze(costs)
    plt.plot(costs)
    plt.ylabel('cost')
    plt.xlabel('iterations (per hundreds)')
    plt.title("House Price Distributions of Beijing Beiyuan Area")
    plt.show()
    plt.savefig('costs.png')


In [6]:

# event_handler to print training and testing info
def event_handler(event):
    """
    事件处理器，可以根据训练过程的信息作相应操作

    Args:
        event -- 事件对象，包含event.pass_id, event.batch_id, event.cost等信息
    Return:
    """
    if isinstance(event, paddle.event.EndIteration):
        if event.pass_id % 100 == 0:
            print "Pass %d, Batch %d, Cost %f" % (
                event.pass_id, event.batch_id, event.cost)
            costs.append(event.cost)

    if isinstance(event, paddle.event.EndPass):
        result = trainer.test(
            reader=paddle.batch(test(), batch_size=2),
            feeding=feeding)
        print "Test %d, Cost %f" % (event.pass_id, result.cost)


In [7]:



def main():
    """
    初始化，定义神经网络结构，训练
    Args:
    Return:
    """
    # init
    paddle.init(use_gpu=False, trainer_count=1)

    # network config
    x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(1))
    y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
    y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
    cost = paddle.layer.mse_cost(input=y_predict, label=y)

    # create parameters
    parameters = paddle.parameters.create(cost)

    # create optimizer
    optimizer = paddle.optimizer.Momentum(momentum=0)

    # stochastic gradient descent
    trainer = paddle.trainer.SGD(
        cost=cost, parameters=parameters, update_equation=optimizer)

    # mapping data
    feeding = {'x': 0, 'y': 1}

    # 记录cost
    costs = []
    
    # training
    readerTemp = train()
    
    print readerTemp
    for r in readerTemp():
        print r
    myReader = paddle.reader.shuffle(
        readerTemp, 
        buf_size=50)
    
    myBatch = paddle.batch(
            myReader,
            batch_size=2)
    
    
    trainer.train(
        reader=myBatch,
        feeding=feeding,
        #event_handler=event_handler,
        num_passes=300)

    # print result parameter
    print("Result Parameters as below:")
    a = parameters.get('___fc_layer_0__.w0')[0]
    b = parameters.get('___fc_layer_0__.wbias')[0]
    print(a, b)

    x0 = X_RAW[0]
    y0 = a * CODEMASTER_TRAIN_DATA[0][0] + b

    x1 = X_RAW[1]
    y1 = a * CODEMASTER_TRAIN_DATA[1][0] + b

    a = (y0 - y1) / (x0 - x1)
    b = (y1 - a * x1)

    print 'a = ', a
    print 'b = ', b

    plot_costs(costs)
if __name__ == '__main__':
    main()


[INFO 2017-11-29 07:55:30,439 networks.py:1482] The input order is [x, y]
[INFO 2017-11-29 07:55:30,444 networks.py:1488] The output order is [__mse_cost_0__]
[INFO 2017-11-29 07:55:30,454 networks.py:1482] The input order is [x, y]
[INFO 2017-11-29 07:55:30,459 networks.py:1488] The output order is [__mse_cost_0__]


<function reader at 0x7f542811d0c8>
(array([ 0.0264306]), array([ 599.]))
(array([-0.16203503]), array([ 450.]))
(array([-0.03380584]), array([ 440.]))
(array([ 0.21608469]), array([ 780.]))
(array([-0.20644611]), array([ 450.]))
(array([-0.1291333]), array([ 315.]))
(array([ 0.18405867]), array([ 998.]))
(array([-0.18542904]), array([ 435.]))
(array([-0.23221705]), array([ 435.]))
(array([-0.21407731]), array([ 225.]))
(array([ 0.02586764]), array([ 685.]))
(array([-0.19606268]), array([ 320.]))
(array([ 0.00097241]), array([ 568.]))
(array([-0.27956803]), array([ 365.]))
(array([-0.06182861]), array([ 530.]))
(array([ 0.20751523]), array([ 720.]))
(array([-0.05913893]), array([ 709.]))
(array([-0.09216576]), array([ 510.]))
(array([-0.01929405]), array([ 600.]))
(array([-0.22295953]), array([ 300.]))
(array([ 0.00985463]), array([ 580.]))
(array([ 0.00028435]), array([ 380.]))
(array([-0.31053068]), array([ 210.]))
(array([-0.16666379]), array([ 320.]))
(array([-0.10805367]), array([

(array([ 0.48186314]), array([ 1000.]))
(array([ 0.06252242]), array([ 720.]))
(array([-0.26386777]), array([ 560.]))
(array([-0.03518196]), array([ 420.]))
(array([ 0.44702233]), array([ 1160.]))
(array([-0.17160531]), array([ 387.]))
(array([-0.14433315]), array([ 420.]))
(array([-0.16622594]), array([ 265.]))
(array([ 0.4713546]), array([ 1395.]))
(array([-0.17291887]), array([ 455.]))
(array([-0.13226084]), array([ 480.]))
(array([ 0.2244665]), array([ 760.]))
(array([ 0.00872872]), array([ 570.]))
(array([-0.13182298]), array([ 265.]))
(array([ 0.21239419]), array([ 750.]))
(array([ 0.20294902]), array([ 870.]))
(array([-0.14789854]), array([ 450.]))
(array([-0.24529017]), array([ 230.]))
(array([ 0.05226409]), array([ 560.]))
(array([ 0.46509952]), array([ 1600.]))
(array([ 0.08166297]), array([ 815.]))
(array([ 0.3688338]), array([ 770.]))
(array([-0.14289448]), array([ 450.]))
(array([-0.04306337]), array([ 568.]))
(array([ 0.17111065]), array([ 970.]))
(array([-0.2245233]), ar

(array([-0.19606268]), array([ 252.]))
(array([-0.0916028]), array([ 560.]))
(array([ 0.1824949]), array([ 600.]))
(array([-0.04587815]), array([ 405.]))
(array([ 0.19950872]), array([ 540.]))
(array([ 0.23879064]), array([ 800.]))
(array([ 0.36332933]), array([ 1280.]))
(array([ 0.08979458]), array([ 650.]))
(array([ 0.05226409]), array([ 845.]))
(array([-0.21908138]), array([ 420.]))
(array([-0.17742253]), array([ 525.]))
(array([ 0.01054269]), array([ 400.]))
(array([-0.05225834]), array([ 350.]))
(array([-0.099234]), array([ 308.]))
(array([ 0.26718871]), array([ 1100.]))
(array([ 0.15666141]), array([ 530.]))
(array([ 0.19663139]), array([ 750.]))
(array([-0.13013411]), array([ 490.]))
(array([-0.13538838]), array([ 635.]))
(array([-0.17154275]), array([ 400.]))
(array([-0.04168725]), array([ 435.]))
(array([-0.16441196]), array([ 270.]))
(array([-0.13538838]), array([ 415.]))
(array([-0.00403165]), array([ 600.]))
(array([ 0.07772227]), array([ 800.]))
(array([-0.28551035]), arra