# Data Treating

Generally, there are five steps in data treating:

1. Read data;
2. Divide dataset;
3. Generate batch data
4. Shuffle dataset;
5. Test data validity

In [11]:
import paddle
from paddle.nn import Linear
import paddle.nn.functional as F
import os
import gzip
import json
import random
import numpy as np

## Read Data and Divide Dataset

In [12]:
datafile = './work/mnist.json.gz'
print('loading mnist dataset from {} ......'.format(datafile))
# load json data fiel
data = json.load(gzip.open(datafile))
print('mnist dataset load done')
# unpackage the data into training set, verifying set and testing set.
train_set, val_set, eval_set = data
print('The number of training data: ', len(train_set[0]))
print('The number of verifying data: ', len(val_set[0]))
print('The number of testing data: ', len(eval_set[0]))

loading mnist dataset from ./work/mnist.json.gz ......
mnist dataset load done
The number of training data:  50000
The number of verifying data:  10000
The number of testing data:  10000


## Generate Batch Data

In [13]:
imgs, labels = train_set[0], train_set[1]
imgs_length = len(imgs)
# define the index of every singel data
index_list = list(range(imgs_length))
# Shuffle the index of data
random.shuffle(index_list)

# define batch size
BATCHSIZE = 100

# @data generator:
def data_generator():
    imgs_list = []
    labels_list = []

    for i in index_list:
        img = np.array(imgs[i]).astype('float32')
        label = np.array(labels[i]).astype('float32')
        imgs_list.append(img)
        labels_list.append(label)
        if len(imgs_list) == BATCHSIZE:
            # get a dataset with length BATCHSIZE
            yield np.array(imgs_list), np.array(labels_list)
            # clear
            imgs_list = []
            labels_list = []

    if len(imgs_list) > 0:
        yield np.array(imgs_list), np.array(labels_list)

    return data_generator

train_loader = data_generator
# Read data in an iterative manner
for batch_id, data in enumerate(train_loader()):
    image_data, label_data = data
    if batch_id == 0:
        print('image dimension: {}, label dimension: {}'.format(image_data.shape, label_data.shape))

image dimension: (100, 784), label dimension: (100,)


## Test Data Validity

There are mainly two ways to test data validity:

1. Machine calibration;
2. Manual verification.

### Machine Calibration

In [14]:
assert len(imgs) == len(labels), \
    "length of train_imgs({}) should be the same as train_labels({})".format(len(imgs), len(labels))

## Encapsulation

In [15]:
# normalize images' data into [0, 1], and reshape them into [batch_size, w*h]
def norm_img(img):
    assert len(img.shape) == 3
    batch_size, img_h, img_w = img.shape
    img = img / 255
    img = paddle.reshape(img, [batch_size, img_h*img_w])

In [16]:
def load_data(mode = 'train'):
    # Read Data and Divide Dataset
    datafile = './work/mnist.json.gz'
    print('loading mnist dataset from {} ......'.format(datafile))
    ## load json data fiel
    data = json.load(gzip.open(datafile))
    print('mnist dataset load done')
    ## unpackage the data into training set, verifying set and testing set.
    train_set, val_set, eval_set = data

    if mode == 'train':
        imgs, labels = train_set[0], train_set[1]
    elif mode == 'valid':
        imgs, labels = val_set[0], val_set[1]
    elif mode == 'eval':
        imgs, labels = eval_set[0], eval_set[1]
    else:
        raise Exception('mode can only be one of [\'train\', \'valid\', \'eval\']')
    print('The number of {} data: {}'.format(mode, len(imgs)))

    # Test Data Validity
    imgs_length = len(imgs)

    assert len(imgs) == len(labels), \
        "length of train_imgs({}) should be the same as train_labels({})".format(len(imgs), len(labels))

    # Generate batch data
    # define the index of every singel data
    index_list = list(range(imgs_length))
    # Shuffle the index of data
    random.shuffle(index_list)
    
    # define batch size
    BATCH_SIZE = 100
    
    # @data generator:
    def data_generator():
        imgs_list = []
        labels_list = []
    
        for i in index_list:
            img = np.array(imgs[i]).astype('float32')
            # norm_img(img)
            label = np.array(labels[i]).astype('float32')
            imgs_list.append(img)
            labels_list.append(label)
            if len(imgs_list) == BATCH_SIZE:
                # get a dataset with length BATCHSIZE
                yield np.array(imgs_list), np.array(labels_list)
                # clear
                imgs_list = []
                labels_list = []
    
        if len(imgs_list) > 0:
            yield np.array(imgs_list), np.array(labels_list)

    return data_generator

In [17]:
# Net structure of MNIST
class MNIST(paddle.nn.Layer):
    def __init__(self):
        super(MNIST, self).__init__()

        # define one fully connected layer, 
        # which has 13 input dimensions and 1 output dimensions
        self.fc = Linear(in_features = 784, out_features = 1)
    
    def forward(self, inputs):
        x = self.fc(inputs)
        return x

In [18]:
# train model
def train(model):
    model.train()
    # load data
    train_loader = load_data('train')
    opt = paddle.optimizer.SGD(learning_rate = 0.001, parameters = model.parameters())
    
    EPOCH_NUM = 10

    for epoch_id in range(EPOCH_NUM):
        for batch_id, data in enumerate(train_loader()):
            # 1. prepare data
            images, labels = data
            images = paddle.to_tensor(images)
            labels = paddle.to_tensor(labels)
            # 2. Forward calculation
            predicts = model(images)
            # 3. Calculate losses
            loss = F.square_error_cost(predicts, labels)
            avg_loss = paddle.mean(loss)
            # print loss every 200 batches of data
            if batch_id % 200 == 0:
                print("epoch: {} / batch: {}, loss = {}".format(epoch_id, batch_id, float(avg_loss)))
            
            # 4. Backpropagation
            avg_loss.backward()
            opt.step()
            opt.clear_grad()

    # save model
    paddle.save(model.state_dict(), './mnist.pdparams')

In [19]:
model = MNIST()
train(model)

loading mnist dataset from ./work/mnist.json.gz ......
mnist dataset load done
The number of train data: 50000
epoch: 0 / batch: 0, loss = 20.97543716430664
epoch: 0 / batch: 200, loss = 7.980806827545166
epoch: 0 / batch: 400, loss = 8.513910293579102
epoch: 1 / batch: 0, loss = 8.79451847076416
epoch: 1 / batch: 200, loss = 7.934875011444092
epoch: 1 / batch: 400, loss = 8.479248046875
epoch: 2 / batch: 0, loss = 8.771469116210938
epoch: 2 / batch: 200, loss = 7.923612594604492
epoch: 2 / batch: 400, loss = 8.465299606323242
epoch: 3 / batch: 0, loss = 8.762093544006348
epoch: 3 / batch: 200, loss = 7.918120384216309
epoch: 3 / batch: 400, loss = 8.457737922668457
epoch: 4 / batch: 0, loss = 8.756901741027832
epoch: 4 / batch: 200, loss = 7.914795398712158
epoch: 4 / batch: 400, loss = 8.452914237976074
epoch: 5 / batch: 0, loss = 8.753449440002441
epoch: 5 / batch: 200, loss = 7.912471294403076
epoch: 5 / batch: 400, loss = 8.449481964111328
epoch: 6 / batch: 0, loss = 8.75089359283

## Asynchronous Data Treating

In [None]:
# TODO: