# 从0开始卷积神经网络

卷积层中的每个数据层都代表一个3维输入张量$(in\_channels \times height \times width)$；此外，我们把希望经过卷积层得到的输出通道数设为$output\_channels$。因此，每个卷积层我们都用一个四维的张量来表示，即：

$$(output\_channels \times in\_channels \times height \times width)$$

这个四维张量就是该卷积层的权重$W$，其中各个维度的含义如下：
* $output\_channels$ : 表示我们想要该卷积层中有多少个输出通道
* $in\_channels$ : 表示该层的输入通道数，也即上一层的输出通道数
* $height$ : 卷积核(kernel)的高
* $width$ : 卷积核(kernel)的宽

而对应的在卷积神经网络中我们的输入数据的格式均为四维张量：

$$(batch \times channel \times height \times width)$$

<img src="http://zh.gluon.ai/_images/no_padding_no_strides.gif">

<font color="red">一定要记住卷积层中的**权重**的张量格式以及我们**输入输出**数据的格式，这样就很好办了</font>

下面我们使用卷积神经网络实现MNIST分类，**这是我们最后一次使用MNIST数据集**。

## 加载数据

In [1]:
import mxnet as mx
import numpy as np

from mxnet import nd
from mxnet import gluon
from mxnet import autograd

import utils
# Device context used further down when creating the parameters and when
# moving each batch of data/labels before it is fed to the network.
ctx = mx.gpu()

In [2]:
# Width of the final fully connected layer (used for the shapes of W4 and b4).
num_outputs = 10
# Presumably the flattened image size (784 = 28 * 28); not referenced by the
# code visible in this file — TODO confirm whether it is still needed.
num_inputs = 784
# Batch size handed to the dataset loader.
batch_size = 128

# utils.load_dataset is a project helper; the objects it returns are iterated
# later as (data, label) batches.
train_data, test_data = utils.load_dataset(batch_size, data_type='mnist')

## 定义权重

In [None]:
# `scale` argument of the normal distribution every parameter is drawn from.
weight_scale = .01

# Number of output channels of the first and second convolution layers.
num_filter_conv1 = 20
num_filter_conv2 = 50

# Input / output widths of the first fully connected layer.  The input width
# must equal the flattened size of the second pooling output.
num_input_fc1 = 800
num_output_fc1 = 128

# Draw all parameters in one pass; the order of the shapes below is the order
# in which the random draws happen and the order of the unpacking afterwards.
params = [
    nd.random.normal(shape=shape, scale=weight_scale, ctx=ctx)
    for shape in (
        (num_filter_conv1, 1, 3, 3),                 # W1: conv1 kernels
        num_filter_conv1,                            # b1: conv1 bias
        (num_filter_conv2, num_filter_conv1, 5, 5),  # W2: conv2 kernels
        num_filter_conv2,                            # b2: conv2 bias
        (num_input_fc1, num_output_fc1),             # W3: fc1 weights
        num_output_fc1,                              # b3: fc1 bias
        (num_output_fc1, num_outputs),               # W4: fc2 weights
        num_outputs,                                 # b4: fc2 bias
    )
]
W1, b1, W2, b2, W3, b3, W4, b4 = params

# Allocate a gradient buffer for every parameter so autograd can fill it.
for param in params:
    param.attach_grad()

## 维度测试

<font color="red">**使用卷积神经网络一定要进行维度测试**</font>

In [None]:
# Dimension check: push one batch through every layer by hand and print the
# shape after each step, so that the parameter shapes can be validated.

# Grab only the first batch and move it to the device.
for data, _ in train_data:
    data = data.as_in_context(ctx)
    break

print("data shape : ", data.shape)

# First convolution followed by 2x2 max pooling with stride 2.
conv1 = nd.Convolution(data=data, weight=W1, bias=b1, kernel=W1.shape[2:],
                       stride=(1,1), num_filter=W1.shape[0])
print("conv1 shape : ", conv1.shape)

pool1 = nd.Pooling(data=conv1, kernel=(2,2), pool_type='max', stride=(2,2))
print("pool1 shape : ", pool1.shape)

# Second convolution followed by 2x2 max pooling with stride 2.
conv2 = nd.Convolution(data=pool1, weight=W2, bias=b2, kernel=W2.shape[2:],
                       stride=(1,1), num_filter=W2.shape[0])
# The label used to say "conv1", which mislabelled this tensor in the output.
print("conv2 shape : ", conv2.shape)

pool2 = nd.Pooling(data=conv2, kernel=(2,2), pool_type='max', stride=(2,2))
print("pool2 shape : ", pool2.shape)

# From the pool2 shape we can read off how many input units the first fully
# connected layer needs after flattening: 800 (50*4*4), assuming 28x28 inputs.

flatten = nd.Flatten(pool2)
print("flatten shape : ", flatten.shape)

# Fully connected layers (no activation here; only shapes are being checked).
fc1 = nd.dot(flatten, W3) + b3
print("fc1 shape : ", fc1.shape)

fc2 = nd.dot(fc1, W4) + b4
print("fc2 shape : ", fc2.shape)

## 定义模型

In [None]:
def relu(X):
    """Return the element-wise maximum of X and zero.

    Args:
        X: NDArray of any shape.

    Returns:
        NDArray with the same shape as X.
    """
    zeros = nd.zeros_like(X)
    return nd.maximum(X, zeros)

In [None]:
def net(X):
    """Run the forward pass: two conv/pool/relu stages, then two dense layers.

    Reads the module-level parameters W1..W4 and b1..b4.

    Args:
        X: input batch fed to the first convolution.

    Returns:
        Output of the last fully connected layer (no softmax applied).
    """
    feature_map = X

    # Both convolution stages share the same structure and differ only in
    # their parameters: convolution (stride 1) -> 2x2 max pooling -> relu.
    for kernels, bias in ((W1, b1), (W2, b2)):
        feature_map = nd.Convolution(
            data=feature_map,
            weight=kernels,
            bias=bias,
            kernel=kernels.shape[2:],
            stride=(1, 1),
            num_filter=kernels.shape[0],
        )
        feature_map = nd.Pooling(
            data=feature_map, kernel=(2, 2), pool_type="max", stride=(2, 2)
        )
        feature_map = relu(feature_map)

    # Flatten the feature maps and apply the hidden dense layer.
    hidden = relu(nd.dot(nd.Flatten(feature_map), W3) + b3)

    # Output dense layer.
    return nd.dot(hidden, W4) + b4

## 定义损失函数和优化器

In [None]:
def softmax(ylinear):
    """Return the softmax of `ylinear`, normalised along axis 1.

    Args:
        ylinear: 2-D NDArray of unnormalised scores, one row per example.

    Returns:
        NDArray of the same shape; each row is exp(score) divided by the sum
        of exp(score) over that row.
    """
    # Shift every row by its own maximum instead of the global maximum of the
    # whole batch.  Softmax is unchanged by a per-row shift, and after it each
    # row contains exp(0) = 1, so the row sum can never underflow to zero
    # (which would produce 0 / 0 = NaN).  It also avoids pulling a Python
    # scalar out of the array via .asscalar().
    shifted = ylinear - nd.max(ylinear, axis=1, keepdims=True)
    yexp = nd.exp(shifted)
    return yexp / nd.sum(yexp, axis=1, keepdims=True)
 
def softmax_cross_entropy(yhat, y):
    """Return the per-example cross-entropy between softmax(yhat) and y.

    Args:
        yhat: 2-D NDArray of unnormalised scores, one row per example.
        y: NDArray of target weights with the same shape as `yhat`
            (the training loop passes one-hot rows).

    Returns:
        NDArray holding one loss value per row: -sum(y * log_softmax(yhat)).
    """
    # Compute log-softmax directly with the log-sum-exp trick rather than
    # log(softmax(yhat)).  When a probability underflows to 0 the old form
    # produced log(0) = -inf, and 0 * -inf = NaN for the zero entries of y.
    shifted = yhat - nd.max(yhat, axis=1, keepdims=True)
    log_partition = nd.log(nd.sum(nd.exp(shifted), axis=1, keepdims=True))
    log_prob = shifted - log_partition
    return -nd.sum(y * log_prob, axis=1)

In [None]:
def SGD(params, lr):
    """Apply one in-place gradient-descent step to every parameter.

    Args:
        params: iterable of arrays, each exposing its gradient as `.grad`.
        lr: factor the gradient is multiplied by before being subtracted.
    """
    for weight in params:
        step = lr * weight.grad
        # Slice assignment writes into the existing array rather than
        # rebinding the loop variable, so the caller's arrays are updated.
        weight[:] = weight - step

## 定义评估函数

In [None]:
def evaluate_accuracy(img_iter, net, ctx):
    """Return the accuracy of `net` over every batch yielded by `img_iter`.

    Args:
        img_iter: iterable yielding (data, label) batches.
        net: callable mapping a data batch to per-class scores.
        ctx: device context the batches are moved to before evaluation.

    Returns:
        The value component of mx.metric.Accuracy().get().
    """
    metric = mx.metric.Accuracy()
    for batch, targets in img_iter:
        scores = net(batch.as_in_context(ctx))
        # The predicted class is the index of the largest score in each row.
        metric.update(preds=nd.argmax(scores, axis=1),
                      labels=targets.as_in_context(ctx))
    return metric.get()[1]

In [None]:
# Accuracy of the network on the training set; this cell appears before the
# training loop, so it reflects the randomly initialised parameters.
evaluate_accuracy(train_data, net, ctx)

## 训练

In [None]:
# Training hyper-parameters.
epochs = 10
learning_rate = 0.001

for epoch in range(epochs):
    cumulative_loss = .0
    # Count the samples actually seen instead of hard-coding the dataset size,
    # so the reported average stays correct for any dataset or batch layout.
    num_examples = 0
    for data, label in train_data:
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)
        # One-hot width follows the network's output size (num_outputs).
        label_one_hot = nd.one_hot(label, num_outputs)
        # Record the forward pass so gradients can be computed afterwards.
        with autograd.record():
            output = net(data)
            loss = softmax_cross_entropy(output, label_one_hot)
        loss.backward()
        SGD(params, learning_rate)
        cumulative_loss += nd.sum(loss).asscalar()
        # Axis 0 of the input is the batch dimension.
        num_examples += data.shape[0]

    train_acc = evaluate_accuracy(train_data, net, ctx)
    test_acc = evaluate_accuracy(test_data, net, ctx)
    # max(..., 1) keeps the report working even if the iterator was empty.
    print("Epoch %s, Train Avg Loss %s, Train acc %s, Test acc %s"
         % (epoch, cumulative_loss / max(num_examples, 1), train_acc, test_acc))