# Discussion

The Boston house price prediction task in the first chapter is a regression task whose result is a continuous value, so we can use mean squared error as a loss function. However, the result of the classification task is discrete labeling, so it is unreasonable to use mean squared error as a loss function.

## Softmax Function

$$
Softmax(x_i) = \frac{e^{x_i}}{\sum\limits_{j=0}^{N}{e^{x_j}}}
$$

As can be seen from the formula, the range of each output is between 0~1, and the sum of all outputs is equal to 1.

## Cross-entropy

People are used to using cross-entropy as a loss function for classification problems.

Firstly, "one-bit valid encoding" for categories:

$$
\begin{array}
\mathbf{y} =& [y_1, y_2, \dots, y_n]^T \\
y_i =& \left\{ \begin{array}\
    & 1\ & \mathrm{if}\ i = y \\
    & 0\ & \mathrm{otherwise}
    \end{array} \right.
\end{array}
$$

The maximum value is the forecast:

$$
\hat{y} = \mathop{\mathrm{argmax}}\limits_i o_i
$$

Cross-entropy is often used to measure the difference between two probabilities:

$$
H(\mathbf{p},\mathbf{q}) = -\sum\limits_i{p_i\log{q_i}}
$$

Use it as loss function:

$$
l(y, \hat{y}) = -\sum\limits_i y_i\log{\hat{y}_i} = -\log{\hat{y}_y}
$$

Its gradient is the difference between true probability and predicted probability:

$$
\partial_{o_i} l(y, \hat{y}) = \mathrm{softmax}(\mathbf{o})_i - y_i
$$

In [33]:
import paddle
from paddle.nn import Conv2D, MaxPool2D, Linear
from PIL import Image
import paddle.nn.functional as F
import os
import gzip
import json
import random
import numpy as np

In [4]:
def load_data(mode = 'train'):
    # Read Data and Divide Dataset
    datafile = './work/mnist.json.gz'
    print('loading mnist dataset from {} ......'.format(datafile))
    ## load json data fiel
    data = json.load(gzip.open(datafile))
    print('mnist dataset load done')
    ## unpackage the data into training set, verifying set and testing set.
    train_set, val_set, eval_set = data

    if mode == 'train':
        imgs, labels = train_set[0], train_set[1]
    elif mode == 'valid':
        imgs, labels = val_set[0], val_set[1]
    elif mode == 'eval':
        imgs, labels = eval_set[0], eval_set[1]
    else:
        raise Exception('mode can only be one of [\'train\', \'valid\', \'eval\']')
    print('The number of {} data: {}'.format(mode, len(imgs)))

    # Test Data Validity
    imgs_length = len(imgs)

    assert len(imgs) == len(labels), \
        "length of train_imgs({}) should be the same as train_labels({})".format(len(imgs), len(labels))

    # Generate batch data
    # define the index of every singel data
    index_list = list(range(imgs_length))
    # Shuffle the index of data
    random.shuffle(index_list)
    
    # define batch size
    BATCH_SIZE = 100
    
    # @data generator:
    def data_generator():
        imgs_list = []
        labels_list = []
    
        for i in index_list:
            img = np.array(imgs[i]).astype('float32')
            # norm_img(img)
            label = np.array(labels[i]).astype('int64')
            imgs_list.append(img)
            labels_list.append(label)
            if len(imgs_list) == BATCH_SIZE:
                # get a dataset with length BATCHSIZE
                yield np.array(imgs_list), np.array(labels_list)
                # clear
                imgs_list = []
                labels_list = []
    
        if len(imgs_list) > 0:
            yield np.array(imgs_list), np.array(labels_list)

    return data_generator

In [5]:
# train model with cross-entropy
def train(model):
    model.train()
    # load data
    train_loader = load_data('train')
    opt = paddle.optimizer.SGD(learning_rate = 0.001, parameters = model.parameters())
    
    EPOCH_NUM = 10

    for epoch_id in range(EPOCH_NUM):
        for batch_id, data in enumerate(train_loader()):
            # 1. prepare data
            images, labels = data
            images = paddle.to_tensor(images)
            labels = paddle.to_tensor(labels)
            # 2. Forward calculation
            predicts = model(images)
            # 3. Calculate losses with cross-entropy
            loss = F.cross_entropy(predicts, labels)
            avg_loss = paddle.mean(loss)
            # print loss every 200 batches of data
            if batch_id % 200 == 0:
                print("epoch: {} / batch: {}, loss = {}".format(epoch_id, batch_id, float(avg_loss)))
            
            # 4. Backpropagation
            avg_loss.backward()
            opt.step()
            opt.clear_grad()

    # save model
    paddle.save(model.state_dict(), './mnist.pdparams')

In [6]:
def evaluation(model, datasets):
    model.eval()

    acc_set = list()
    for batch_id, data in enumerate(datasets()):
        # 1. prepare data
        images, labels = data
        images = paddle.to_tensor(images)
        labels = paddle.to_tensor(labels)
        # 2. Forward calculation
        pred = model(images)

        acc = paddle.metric.accuracy(input = pred, label = labels)
        acc_set.extend(acc.numpy())

    # calculation accuracy
    acc_val_mean = np.array(acc_set).mean()
    return acc_val_mean

In [7]:
class MNIST(paddle.nn.Layer):
    def __init__(self):
        super(MNIST, self).__init__()

        # Define the convolutional layer 1 with 
        # the output feature channel (out_channels) set to 20, 
        # the convolution kernel size (kernel_size) to 5, 
        # the convolution step size stride to 1, and padding to 2
        self.conv1 = Conv2D(in_channels = 1, out_channels = 20, kernel_size = 5, stride = 1, padding = 2)
        # Define the pooling layer 1 with  
        # the size of the pooling kernel (kernel_size) to 2, 
        # the pooling step to 2
        self.max_pool1 = MaxPool2D(kernel_size = 2, stride = 2)
        # Define the convolutional layer 2
        self.conv2 = Conv2D(in_channels = 20, out_channels = 20, kernel_size = 5, stride = 1, padding = 2)
        # Define the pooling layer 2
        self.max_pool2 = MaxPool2D(kernel_size = 2, stride = 2)
        # Define a fully connected layer
        self.fc = Linear(in_features = 980, out_features = 10)

    # define farword calculation function, the activation function of hidden layers is ReLU
    def forward(self, inputs):
        inputs = paddle.reshape(inputs, [inputs.shape[0], 1, 28, 28])
        x = self.conv1(inputs)
        x = F.relu(x)
        x = self.max_pool1(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = self.max_pool2(x)
        x = paddle.reshape(x, [x.shape[0], -1])
        x = self.fc(x)
        return x

In [20]:
def load_image(img_path):
    im = Image.open(img_path).convert('L')
    im = im.resize((28,28), Image.ANTIALIAS)
    im = np.array(im).reshape(1,1,28,28).astype(np.float32)
    # normalize image
    im = 1.0 - im / 255.0
    return im

In [21]:
model = MNIST()

In [59]:
# train model
train(model)

loading mnist dataset from ./work/mnist.json.gz ......
mnist dataset load done
The number of train data: 50000
epoch: 0 / batch: 0, loss = 3.6161539554595947
epoch: 0 / batch: 200, loss = 1.2341917753219604
epoch: 0 / batch: 400, loss = 0.8244312405586243
epoch: 1 / batch: 0, loss = 0.7366979718208313
epoch: 1 / batch: 200, loss = 0.447758287191391
epoch: 1 / batch: 400, loss = 0.508483350276947
epoch: 2 / batch: 0, loss = 0.4283043146133423
epoch: 2 / batch: 200, loss = 0.29235202074050903
epoch: 2 / batch: 400, loss = 0.4047854542732239
epoch: 3 / batch: 0, loss = 0.31942665576934814
epoch: 3 / batch: 200, loss = 0.23105426132678986
epoch: 3 / batch: 400, loss = 0.3476784825325012
epoch: 4 / batch: 0, loss = 0.2622949182987213
epoch: 4 / batch: 200, loss = 0.1972339004278183
epoch: 4 / batch: 400, loss = 0.30950772762298584
epoch: 5 / batch: 0, loss = 0.22579124569892883
epoch: 5 / batch: 200, loss = 0.17410260438919067
epoch: 5 / batch: 400, loss = 0.2815054953098297
epoch: 6 / batc

In [31]:
params_file_path = 'mnist.pdparams'
img_path = 'work/9.jpg'
# load model
param_dict = paddle.load(params_file_path)
model.load_dict(param_dict)
# load data
model.eval()
tensor_img = load_image(img_path)
results = model(paddle.to_tensor(tensor_img))
lab = np.argsort(results.numpy())
print('The result of this prediction is ', lab[0])

The result of this prediction is  [7 0 9 4 1 2 6 5 3 8]
