# Designing Custom Layer With Gluon

In [1]:
import mxnet as mx
import utils

from mxnet import nd
from mxnet import gluon
from mxnet import autograd
mx.random.seed(1)

ctx = mx.cpu()

In [2]:
batch_size = 64
train_data, test_data = utils.load_dataset(batch_size, data_type='mnist')

自定义一个0均值层，其作用是将所有的输入减去均值使得其大致以0为中心

## 设计一个简单的自定义层

In [3]:
class CenteredLayer(gluon.Block):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
    
    def forward(self, X):
        return X - nd.mean(X)

In [4]:
net1 = CenteredLayer()
data = nd.arange(6).reshape((2,3))
net1(data)


[[-2.5 -1.5 -0.5]
 [ 0.5  1.5  2.5]]
<NDArray 2x3 @cpu(0)>

我们可以将其加入我们的神经网络中，因为我们使用了``Dense``，因此我们需要初始化我们的``net``

In [5]:
net2 = gluon.nn.Sequential()
net2.add(gluon.nn.Dense(64))
net2.add(gluon.nn.Dense(10))
net2.add(net1)

In [6]:
net2.collect_params().initialize(mx.init.Normal(sigma=0.01), ctx=ctx, force_reinit=True)
net2.collect_params()

sequential0_ (
  Parameter dense0_weight (shape=(64, 0), dtype=<class 'numpy.float32'>)
  Parameter dense0_bias (shape=(64,), dtype=<class 'numpy.float32'>)
  Parameter dense1_weight (shape=(10, 0), dtype=<class 'numpy.float32'>)
  Parameter dense1_bias (shape=(10,), dtype=<class 'numpy.float32'>)
)

In [7]:
for data, _ in train_data:
    data = data.as_in_context(ctx)
    print(data.shape)
    break

(64, 1, 28, 28)


In [8]:
output = net2(data)
o1 = output[:1]
o1


[[-0.0016136   0.00496426 -0.00564828 -0.00220915  0.00431405 -0.00098155
  -0.01009783 -0.00934961  0.00074095  0.00453386]]
<NDArray 1x10 @cpu(0)>

In [9]:
net2.collect_params()

sequential0_ (
  Parameter dense0_weight (shape=(64, 784), dtype=<class 'numpy.float32'>)
  Parameter dense0_bias (shape=(64,), dtype=<class 'numpy.float32'>)
  Parameter dense1_weight (shape=(10, 64), dtype=<class 'numpy.float32'>)
  Parameter dense1_bias (shape=(10,), dtype=<class 'numpy.float32'>)
)

我们可以验证一下其均值是不是大约为0.但是因为MXNet使用低精度的浮点运算，因此其结果不会刚好为0，但是事实证明，大多数深度学习算法不会因为精度的损失而收到太多影响(**这是因为MXNet默认使用32位float，会带来一定的浮点精度误差**。)

In [10]:
nd.mean(o1)


[-0.00153469]
<NDArray 1 @cpu(0)>

## 设计带模型参数的自定义层

我们的CenteredLayer设计的很好，但是有一个缺点是，他没有可以学习的参数，因此，我们需要构建自带可学习参数的自定义层，一个想法是我们之前用net[0].weight.data()打印出了参数，我们也可以显式自定义这样一个参数。

In [11]:
type(net2[0].weight)

mxnet.gluon.parameter.Parameter

In [12]:
my_param = gluon.Parameter('my_exciting_param', grad_req='write', shape=(5,5))

In [13]:
my_param

Parameter my_exciting_param (shape=(5, 5), dtype=<class 'numpy.float32'>)

In [14]:
my_param.name

'my_exciting_param'

In [15]:
my_param.initialize(ctx=ctx, force_reinit=True)
my_param.data(), my_param.grad()

(
 [[ 0.01025083 -0.00524203  0.06809364  0.05144259  0.03570646]
  [-0.06397808  0.05853816  0.02415648  0.06541369  0.03648888]
  [-0.02733507  0.06972782  0.02532352 -0.02712101  0.05985367]
  [-0.06086577 -0.03337502  0.05952872  0.04966065  0.06153975]
  [ 0.01351636  0.01735146  0.03452943  0.05660085  0.06955645]]
 <NDArray 5x5 @cpu(0)>, 
 [[ 0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.]]
 <NDArray 5x5 @cpu(0)>)

一般在创建自定义层的时候，我们不会直接使用``Parameter``，而是使用``Block``自带的成员变量params来构建一个``ParameterDict``，它是一个字典，用名字来指向一个``Parameter``。

In [16]:
my_param_dict = gluon.ParameterDict(prefix='self_block0_')

In [17]:
my_param_dict.get('my_exciting_param', grad_req='write', shape=(5,5))

Parameter self_block0_my_exciting_param (shape=(5, 5), dtype=<class 'numpy.float32'>)

In [18]:
my_param_dict

self_block0_ (
  Parameter self_block0_my_exciting_param (shape=(5, 5), dtype=<class 'numpy.float32'>)
)

## 自定义``Dense``层

In [19]:
class MyDense(gluon.Block):
    def __init__(self, out_units, in_units=0, **kwargs):
        super().__init__(**kwargs)
        self.weight = self.params.get('weight', grad_req='write', shape=(in_units, out_units))
        self.bias = self.params.get('bias', grad_req='write', shape=(out_units, ))
        
    def forward(self, X):
        X = nd.dot(X, self.weight.data()) + self.bias.data()
        return nd.relu(X)

In [20]:
dense0 = MyDense(10, in_units=64, prefix="my_first_dense_")
dense0.initialize(ctx=ctx)

In [21]:
X = nd.random.normal(shape=(10, 64))
dense0(X)


[[ 0.03527642  0.53290546  0.          0.13565981  0.          0.45349967
   0.          0.06523603  0.          0.        ]
 [ 0.          0.          0.          0.09760787  0.          0.35007051
   0.          0.1961149   0.08307152  0.        ]
 [ 0.37568897  0.          0.          0.30651212  0.25077805  0.55385858
   0.          0.12833555  0.          0.12669098]
 [ 0.10223103  0.          0.          0.          0.          0.41138449
   0.14127377  0.75098336  0.49911657  0.        ]
 [ 0.12405048  0.          0.09196598  0.          0.          0.          0.
   0.          0.34441897  0.        ]
 [ 0.          0.75866747  0.          0.          0.          0.33118731
   0.15483461  0.          0.1559445   0.10254275]
 [ 0.08817963  0.30868927  0.50328356  0.17626628  0.          0.4166604
   0.26583487  0.          0.          0.50513512]
 [ 0.10270769  0.72153813  0.          0.21675843  0.          0.22371989
   0.56618565  0.07360955  0.          0.        ]
 [ 0.429

In [22]:
dense0.collect_params()

my_first_dense_ (
  Parameter my_first_dense_weight (shape=(64, 10), dtype=<class 'numpy.float32'>)
  Parameter my_first_dense_bias (shape=(10,), dtype=<class 'numpy.float32'>)
)

In [23]:
dense0.name

'my_first_dense'

## 调用我们的层

In [24]:
net = gluon.nn.Sequential()
with net.name_scope():
    net.add(MyDense(64, in_units=784))
    net.add(MyDense(10, in_units=64))
net.initialize(ctx=ctx)

In [25]:
X = nd.random.normal(shape=(10, 784))
net(X)


[[ 0.34072906  0.          0.12865672  0.24016923  0.19848743  0.0227394
   0.          0.6005851   0.24969734  0.        ]
 [ 0.05254082  0.          0.29176399  0.02812026  0.81029677  0.
   0.07850704  0.04315003  0.04554114  0.17009598]
 [ 0.          0.          0.17560008  0.          0.38822314  0.          0.
   0.          0.32893378  0.11164059]
 [ 0.          0.          0.22787735  0.          0.          0.          0.
   0.10587077  0.02748572  0.        ]
 [ 0.          0.10675503  0.20037743  0.16993076  0.19653216  0.          0.
   0.36772966  0.          0.        ]
 [ 0.          0.          0.55489254  0.          0.24004366  0.
   0.06843384  0.39324629  0.          0.        ]
 [ 0.03704725  0.12384593  0.21797663  0.02956173  0.53780967  0.          0.
   0.31945056  0.          0.09588722]
 [ 0.07582741  0.          0.40071988  0.          0.35468271  0.          0.
   0.12891023  0.          0.10195183]
 [ 0.          0.          0.60345781  0.          0.567

## 结论

我们可以使用**延迟初始化**来改进我们的层。仔细想想``gluon``初始化层的时候并不需要指定``in_units``。

## 再自定义一个层

In [26]:
class MyDense(gluon.Block):
    def __init__(self, units, in_units = 0, **kwargs):
        super().__init__(**kwargs)
        self.weight = self.params.get('weight', init=mx.init.Xavier(magnitude=2.24), shape=(in_units, units))
        self.bias = self.params.get('bias', shape=(units, ))
        
    def forward(self, X):
        h1 = nd.dot(X, self.weight.data()) + self.bias.data()
        return nd.relu(h1)

In [27]:
dense1 = MyDense(10, in_units=20)
dense1.initialize(ctx=ctx)
X = nd.random.normal(shape=(10, 20))
dense1(X)


[[ 0.99427682  0.22104916  0.71425343  0.          0.88457561  1.15234816
   0.          0.75347143  0.42495024  0.55035073]
 [ 0.52092326  0.          0.34267759  1.09945452  0.76766425  0.59529632
   0.5251826   0.06589224  0.          2.13549662]
 [ 0.63272196  0.          1.96432257  0.00668729  0.          0.          0.
   0.41263798  0.48197559  0.        ]
 [ 0.          0.          0.39700577  0.          0.          0.24702752
   0.          2.4111433   0.23491818  0.        ]
 [ 2.22609687  0.          0.          1.82167459  0.50350052  0.64773893
   0.          0.          0.          0.36311743]
 [ 0.          0.37025431  0.          0.          1.06610429  0.67238659
   0.          0.          0.39882302  0.        ]
 [ 1.62146688  0.3722173   1.53305316  0.          0.          0.50304931
   0.          0.96356767  0.          0.        ]
 [ 0.98214442  0.33266097  1.10612369  0.          0.56148487  0.          0.
   0.          0.          0.        ]
 [ 0.          

In [31]:
dense1.collect_params()

mydense0_ (
  Parameter mydense0_weight (shape=(20, 10), dtype=<class 'numpy.float32'>)
  Parameter mydense0_bias (shape=(10,), dtype=<class 'numpy.float32'>)
)

## 使用自定义层训练

In [28]:
net2 = gluon.nn.Sequential()
with net2.name_scope():
    net2.add(gluon.nn.Flatten())
    net2.add(MyDense(256, in_units=784))
    net2.add(MyDense(256, in_units=256))
    net2.add(MyDense(10, in_units=256))
net2.collect_params().initialize(ctx=ctx)

In [29]:
utils.evaluate_accuracy_gluon(test_data, net2, ctx)

0.093600000000000003

In [30]:
trainer = gluon.Trainer(net2.collect_params(), 'sgd', {'learning_rate': 0.1})

epochs = 2
num_examples = 60000

for epoch in range(epochs):
    cumulative_loss = .0
    for i, (data, label) in enumerate(train_data):
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)
        label_one_hot = nd.one_hot(label, 10)
        with autograd.record():
            output = net2(data)
            loss = utils.softmax_cross_entropy(output, label_one_hot)
        loss.backward()
        trainer.step(batch_size)
        cumulative_loss += nd.sum(loss) # 发现了一个很奇怪的事情，为什么不用asscalar()，acc就会上去，loss也会降下来
        
    train_acc = utils.evaluate_accuracy_gluon(train_data, net2, ctx)
    test_acc = utils.evaluate_accuracy_gluon(test_data, net2, ctx)
    print("Epoch %s, Train loss %s, Train acc %s, Test acc %s." 
         %(epoch, cumulative_loss / num_examples, train_acc, test_acc))

Epoch 0, Train loss 
[ 2.72249746]
<NDArray 1 @cpu(0)>, Train acc 0.84815, Test acc 0.8486.
Epoch 1, Train loss 
[ 2.3947711]
<NDArray 1 @cpu(0)>, Train acc 0.868483333333, Test acc 0.8666.
