# Discussion

The Boston house price prediction task in the first chapter is a regression task whose result is a continuous value, so we can use mean squared error as the loss function. However, the result of a classification task is a discrete label, so mean squared error is not a suitable loss function for it.

## Softmax Function

$$
Softmax(x_i) = \frac{e^{x_i}}{\sum\limits_{j=0}^{N}{e^{x_j}}}
$$

As can be seen from the formula, each output lies between 0 and 1, and the sum of all outputs is equal to 1.

## Cross-entropy

Cross-entropy is the customary loss function for classification problems.

First, apply one-hot encoding to the categories:

$$
\begin{array}{rl}
\mathbf{y} =& [y_1, y_2, \dots, y_n]^T \\
y_i =& \left\{ \begin{array}{ll}
    1 & \mathrm{if}\ i = y \\
    0 & \mathrm{otherwise}
    \end{array} \right.
\end{array}
$$

The class whose output $o_i$ is largest is taken as the prediction:

$$
\hat{y} = \mathop{\mathrm{argmax}}\limits_i o_i
$$

Cross-entropy is often used to measure the difference between two probability distributions:

$$
H(\mathbf{p},\mathbf{q}) = -\sum\limits_i{p_i\log{q_i}}
$$

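Use it as the loss function (here $\hat{y}_i$ denotes the predicted probability of class $i$, and the subscript $y$ is the true class):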

$$
l(y, \hat{y}) = -\sum\limits_i y_i\log{\hat{y}_i} = -\log{\hat{y}_y}
$$

Its gradient with respect to the output $o_i$ is the difference between the predicted probability and the true probability:

$$
\partial_{o_i} l(y, \hat{y}) = \mathrm{softmax}(\mathbf{o})_i - y_i
$$

In [240]:
import paddle
from paddle.nn import Conv2D, MaxPool2D, Linear
from PIL import Image
import paddle.nn.functional as F
import os
import gzip
import json
import random
import numpy as np

In [241]:
# File that train() saves the model's state dict to
params_files_path = 'mnist.pdparams'

In [262]:
class MNIST(paddle.nn.Layer):
    """Convolutional classifier for 28x28 single-channel images.

    The network is two (Conv2D -> ReLU -> MaxPool2D) stages followed by one
    fully connected layer that emits 10 scores per sample.
    """

    def __init__(self):
        super().__init__()

        # Stage 1: 1 -> 20 channels. A 5x5 kernel with stride 1 and padding 2
        # keeps the spatial size; the 2x2 / stride-2 max pool then halves it.
        self.conv1 = Conv2D(in_channels=1, out_channels=20, kernel_size=5, stride=1, padding=2)
        self.max_pool1 = MaxPool2D(kernel_size=2, stride=2)
        # Stage 2: 20 -> 20 channels with the same kernel and pooling settings.
        self.conv2 = Conv2D(in_channels=20, out_channels=20, kernel_size=5, stride=1, padding=2)
        self.max_pool2 = MaxPool2D(kernel_size=2, stride=2)
        # Classifier head: 28 -> 14 -> 7 after the two pools, so the flattened
        # feature vector has 20 * 7 * 7 = 980 entries.
        self.fc = Linear(in_features=980, out_features=10)

    def forward(self, inputs, labels=None):
        """Run the forward pass; hidden layers use ReLU activations.

        Args:
            inputs: Batch of images; reshaped here to [batch, 1, 28, 28].
            labels: Optional class labels; reshaped here to [batch, 1].

        Returns:
            The output of the fully connected layer when ``labels`` is None,
            otherwise a tuple of that output and the batch accuracy computed
            by ``paddle.metric.accuracy``.
        """
        batch = paddle.reshape(inputs, [inputs.shape[0], 1, 28, 28])
        features = self.max_pool1(F.relu(self.conv1(batch)))
        features = self.max_pool2(F.relu(self.conv2(features)))
        flattened = paddle.reshape(features, [features.shape[0], -1])
        scores = self.fc(flattened)

        # Without labels there is nothing to score against.
        if labels is None:
            return scores

        column_labels = paddle.reshape(labels, [labels.shape[0], 1])
        acc = paddle.metric.accuracy(input=scores, label=column_labels)
        return scores, acc

In [254]:
def load_data(mode = 'train', BATCH_SIZE = 100):
    """Build a batch generator over one split of the MNIST json archive.

    Args:
        mode: Split to read: 'train', 'valid' or 'eval'.
        BATCH_SIZE: Number of samples per yielded batch; the final batch may
            hold fewer samples.

    Returns:
        A zero-argument generator function. Iterating its result yields
        ``(images, labels)`` pairs of numpy arrays (float32 and int64), in
        the sample order that was shuffled once when ``load_data`` was called.

    Raises:
        ValueError: If ``mode`` is not a supported split name. The check runs
            before the archive is opened, so a bad mode fails fast.
        AssertionError: If the split holds different numbers of images and
            labels.
    """
    split_positions = {'train': 0, 'valid': 1, 'eval': 2}
    if mode not in split_positions:
        raise ValueError('mode can only be one of [\'train\', \'valid\', \'eval\']')

    # Read the archive; the context manager closes the gzip handle even if
    # parsing fails.
    datafile = './work/mnist.json.gz'
    print('loading mnist dataset from {} ......'.format(datafile))
    with gzip.open(datafile) as archive:
        data = json.load(archive)
    print('mnist dataset load done')

    # The archive holds the training, validation and evaluation splits, in
    # that order; each split is indexed as [images, labels].
    train_set, val_set, eval_set = data
    chosen = (train_set, val_set, eval_set)[split_positions[mode]]
    imgs, labels = chosen[0], chosen[1]
    print('The number of {} data: {}'.format(mode, len(imgs)))

    # Every image needs exactly one label.
    imgs_length = len(imgs)
    assert len(imgs) == len(labels), \
        "length of train_imgs({}) should be the same as train_labels({})".format(len(imgs), len(labels))

    # Shuffle sample indices once; every pass of the generator reuses this order.
    index_list = list(range(imgs_length))
    random.shuffle(index_list)

    def data_generator():
        imgs_list = []
        labels_list = []

        for i in index_list:
            imgs_list.append(np.array(imgs[i]).astype('float32'))
            labels_list.append(np.array(labels[i]).astype('int64'))
            if len(imgs_list) == BATCH_SIZE:
                # A full batch is ready: emit it and start a new one.
                yield np.array(imgs_list), np.array(labels_list)
                imgs_list = []
                labels_list = []

        # Emit the trailing partial batch, if any.
        if len(imgs_list) > 0:
            yield np.array(imgs_list), np.array(labels_list)

    return data_generator

In [255]:
# choose optimizer;
def choose_opt(opt = 'Adam'):
    """Placeholder for optimizer selection; currently does nothing and returns None."""

# train model with cross-entropy
def train(model, EPOCH_NUM = 10, BATCH_SIZE = 100):
    """Train ``model`` with cross-entropy loss and save its parameters.

    Args:
        model: Network called as ``model(images, labels)`` and expected to
            return ``(predictions, accuracy)``.
        EPOCH_NUM: Number of passes over the training split.
        BATCH_SIZE: Number of samples per training batch.

    Side effects:
        Prints loss and accuracy every 200 batches, and writes the model's
        state dict to the module-level ``params_files_path``.
    """
    model.train()
    train_loader = load_data('train', BATCH_SIZE)

    # Adam with L2 weight decay. The original code built a plain Adam first
    # and immediately overwrote it; only this optimizer was ever used.
    # (SGD, Momentum and Adagrad with learning_rate = 0.01 are alternatives.)
    opt = paddle.optimizer.Adam(
        learning_rate = 0.01,
        weight_decay = paddle.regularizer.L2Decay(coeff=1e-2),
        parameters = model.parameters())

    for epoch_id in range(EPOCH_NUM):
        for batch_id, (images, labels) in enumerate(train_loader()):
            # 1. Convert the numpy batch to tensors
            images = paddle.to_tensor(images)
            labels = paddle.to_tensor(labels)
            # 2. Forward pass; the model also reports batch accuracy
            predicts, acc = model(images, labels)
            # 3. Cross-entropy loss, averaged over the batch
            loss = F.cross_entropy(predicts, labels)
            avg_loss = paddle.mean(loss)
            # Report progress every 200 batches
            if batch_id % 200 == 0:
                print("epoch: {} / batch: {}, loss = {}, acc = {}".format(epoch_id, batch_id, float(avg_loss), float(acc)))

            # 4. Backpropagate, update the parameters, reset the gradients
            avg_loss.backward()
            opt.step()
            opt.clear_grad()

    # Persist the trained parameters
    paddle.save(model.state_dict(), params_files_path)
    print('training is done! model has been saved in \'./{}\''.format(params_files_path))

In [263]:
def evaluation(model):
    """Load the saved parameters into ``model`` and score it on the 'eval' split.

    Args:
        model: Network called as ``model(images, labels)`` and expected to
            return ``(predictions, accuracy)``.

    Returns:
        The unweighted mean of the per-batch accuracies.

    Side effects:
        Overwrites the model's parameters with those stored at the
        module-level ``params_files_path``, switches the model to eval mode,
        and prints the mean loss and accuracy.
    """
    print('starting evaluation...')
    # Use the same path constant that train() saves to. The original code
    # referenced an undefined name here (params_file_path).
    param_dict = paddle.load(params_files_path)
    model.load_dict(param_dict)

    model.eval()
    eval_loader = load_data('eval')

    acc_set = []
    avg_loss_set = []
    for images, labels in eval_loader():
        # 1. Convert the numpy batch to tensors
        images = paddle.to_tensor(images)
        labels = paddle.to_tensor(labels)
        # 2. Forward pass, then cross-entropy loss averaged over the batch
        predicts, acc = model(images, labels)
        loss = F.cross_entropy(predicts, labels)
        avg_loss = paddle.mean(loss)
        acc_set.append(float(acc))
        avg_loss_set.append(float(avg_loss))

    # Average the per-batch metrics
    acc_val_mean = np.array(acc_set).mean()
    avg_loss_val_mean = np.array(avg_loss_set).mean()
    print('loss = {}, acc = {}'.format(avg_loss_val_mean, acc_val_mean))

    return acc_val_mean

In [264]:
def load_image(img_path):
    """Read an image file and convert it into a model-ready array.

    Args:
        img_path: Path of the image file to open.

    Returns:
        A float32 numpy array of shape (1, 1, 28, 28). Pixel value 0 maps to
        1.0 and 255 maps to -1.0.
    """
    # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the same
    # resampling filter under its current name.
    with Image.open(img_path) as source:
        im = source.convert('L').resize((28, 28), Image.LANCZOS)
    im = np.array(im).reshape(1, 1, 28, 28).astype(np.float32)
    # Scale to [-1, 1] with inverted polarity.
    # NOTE(review): presumably matches the value range of the training data - confirm.
    im = 1 - im * 2 / 255.0
    return im

In [265]:
# Create the network instance used by the training, evaluation and prediction cells
model = MNIST()

In [259]:
# train model with the default settings (EPOCH_NUM = 10, BATCH_SIZE = 100);
# train() also saves the parameters to params_files_path when it finishes
train(model)

loading mnist dataset from ./work/mnist.json.gz ......
mnist dataset load done
The number of train data: 50000
epoch: 0 / batch: 0, loss = 4.867584705352783, acc = 0.10000000149011612
epoch: 0 / batch: 200, loss = 2.300097703933716, acc = 0.11999999731779099
epoch: 0 / batch: 400, loss = 0.5449981093406677, acc = 0.8500000238418579
epoch: 1 / batch: 0, loss = 0.23122312128543854, acc = 0.9399999976158142
epoch: 1 / batch: 200, loss = 0.4233676791191101, acc = 0.8700000047683716
epoch: 1 / batch: 400, loss = 0.1553206443786621, acc = 0.949999988079071
epoch: 2 / batch: 0, loss = 0.19009146094322205, acc = 0.9399999976158142
epoch: 2 / batch: 200, loss = 0.32649925351142883, acc = 0.8899999856948853
epoch: 2 / batch: 400, loss = 0.15676245093345642, acc = 0.9599999785423279
epoch: 3 / batch: 0, loss = 0.16446207463741302, acc = 0.9399999976158142
epoch: 3 / batch: 200, loss = 0.2841870188713074, acc = 0.8999999761581421
epoch: 3 / batch: 400, loss = 0.16269247233867645, acc = 0.949999988

In [266]:
# load the parameters saved by train(); the path constant is
# params_files_path (the original cell used an undefined name)
param_dict = paddle.load(params_files_path)
model.load_dict(param_dict)
# switch to inference mode, then score the model on the eval split
model.eval()
evaluation(model)

starting evaluation...
loading mnist dataset from ./work/mnist.json.gz ......
mnist dataset load done
The number of eval data: 10000
loss = 0.16657190918922424, acc = 0.9507000052928924


0.9507000052928924

In [278]:
# Classify the ten sample images work/0.jpg ... work/9.jpg
for digit in range(10):
    sample_path = 'work/{}.jpg'.format(digit)
    scores = model(paddle.to_tensor(load_image(sample_path)))
    # argsort is ascending, so the last index of the single row is the
    # highest-scoring class
    ranking = np.argsort(scores.numpy())
    best_class = ranking[0][-1]
    print('The prediction of file {} is {}'.format(sample_path, best_class))

The prediction of file work/0.jpg is 0
The prediction of file work/1.jpg is 8
The prediction of file work/2.jpg is 2
The prediction of file work/3.jpg is 4
The prediction of file work/4.jpg is 4
The prediction of file work/5.jpg is 4
The prediction of file work/6.jpg is 6
The prediction of file work/7.jpg is 7
The prediction of file work/8.jpg is 2
The prediction of file work/9.jpg is 7
