In [9]:
import mxnet as mx
from mxnet import init, nd
from mxnet.gluon import nn

def try_gpu():
    """Return mx.gpu() when a GPU is usable, otherwise fall back to mx.cpu()."""
    try:
        ctx = mx.gpu()
        # Allocating a tiny array on the device is the probe: it raises
        # MXNetError when no GPU (or no GPU-enabled build) is available.
        _ = nd.array([0], ctx=ctx)
    except mx.base.MXNetError:
        ctx = mx.cpu()
    return ctx


# Resolve the device once so the parameters and the input always live on the
# same context, and so the notebook still runs on a CPU-only machine.
ctx = try_gpu()

# Two-layer MLP: a 256-unit ReLU hidden layer followed by a 10-unit output layer.
net = nn.Sequential()
net.add(nn.Dense(256, activation='relu'))
net.add(nn.Dense(10))
net.initialize(ctx=ctx)

X = nd.random.uniform(shape=(2, 20), ctx=ctx)
Y = net(X)  # forward pass

In [10]:
# Any layer of the network can be reached with square-bracket indexing;
# the layer's parameters are exposed through its `params` attribute.
first_layer_params = net[0].params
first_layer_params, type(first_layer_params)

(dense4_ (
   Parameter dense4_weight (shape=(256, 20), dtype=float32)
   Parameter dense4_bias (shape=(256,), dtype=float32)
 ),
 mxnet.gluon.parameter.ParameterDict)

In [11]:
# Same inspection for the second layer of the network.
second_layer_params = net[1].params
second_layer_params, type(second_layer_params)

(dense5_ (
   Parameter dense5_weight (shape=(10, 256), dtype=float32)
   Parameter dense5_bias (shape=(10,), dtype=float32)
 ),
 mxnet.gluon.parameter.ParameterDict)

In [12]:
# Two equivalent ways to reach the first layer's weight Parameter:
# a lookup by full name in the ParameterDict, or the `weight` attribute.
#
# The numeric suffix in names such as 'dense4_' comes from a counter that
# advances every time a new Dense layer is created in the running kernel, so
# a hard-coded key like 'dense4_weight' only works for one particular
# execution history. Building the key from the layer's own prefix keeps this
# cell valid after a kernel restart or after re-running the cell that builds
# the network.
first_layer = net[0]
first_layer.params[first_layer.prefix + 'weight'], first_layer.weight

(Parameter dense4_weight (shape=(256, 20), dtype=float32),
 Parameter dense4_weight (shape=(256, 20), dtype=float32))

In [14]:
# A Parameter's values and gradient are read with data() and grad() respectively.
first_weight = net[0].weight
first_weight.data()


[[-0.04247847  0.06293995 -0.01837847 ... -0.06219994  0.01467837
  -0.00683771]
 [ 0.0334969  -0.06720173 -0.06451371 ... -0.0396449   0.0269461
   0.00912645]
 [ 0.0093242   0.05111437 -0.03284547 ...  0.02060438  0.03028581
   0.04779406]
 ...
 [ 0.0268476   0.06148554 -0.04265065 ...  0.00752284 -0.04634099
  -0.06273054]
 [-0.00812264  0.01937782  0.05937877 ... -0.04052389  0.01052459
   0.05550297]
 [-0.03866927  0.02501589  0.04987388 ... -0.02471803  0.03994805
  -0.05723546]]
<NDArray 256x20 @gpu(0)>

In [15]:
# No backward pass has been run yet, so every gradient entry is still 0.
first_weight = net[0].weight
first_weight.grad()


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
<NDArray 256x20 @gpu(0)>

In [16]:
# Values of the output layer's bias.
output_bias = net[1].bias
output_bias.data()


[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
<NDArray 10 @gpu(0)>

In [17]:
# Gather every parameter held by all layers nested inside `net`
# (for example, layers nested through add()).
all_params = net.collect_params()
all_params

sequential2_ (
  Parameter dense4_weight (shape=(256, 20), dtype=float32)
  Parameter dense4_bias (shape=(256,), dtype=float32)
  Parameter dense5_weight (shape=(10, 256), dtype=float32)
  Parameter dense5_bias (shape=(10,), dtype=float32)
)

In [18]:
# collect_params accepts a regular expression; this one keeps only the
# parameters whose names end in "weight".
weight_pattern = '.*weight'
net.collect_params(weight_pattern)

sequential2_ (
  Parameter dense4_weight (shape=(256, 20), dtype=float32)
  Parameter dense5_weight (shape=(10, 256), dtype=float32)
)

In [19]:
# Re-initializing an already initialized model requires force_reinit=True.
# NOTE(review): no ctx is passed here; the recorded output of this cell shows
# the parameters on cpu(0) afterwards, not on the device used at first
# initialization.
normal_init = init.Normal(sigma=0.01)
net.initialize(init=normal_init, force_reinit=True)
net[0].weight.data()[0]


[ 0.00958589 -0.01497647  0.00660516 -0.00189036 -0.00273026  0.00918154
  0.02173063  0.000752    0.00071856  0.0112562  -0.00355964 -0.00253765
 -0.00040472  0.00496598  0.01380103  0.01802712  0.00645719  0.01811526
 -0.01252964  0.00387312]
<NDArray 20 @cpu(0)>

In [20]:
# https://www.cnblogs.com/adong7639/p/9547789.html
# Only the first layer's weight is re-initialized here, using Xavier
# initialization (a specially scaled distribution; see the tutorial linked above).
xavier_init = init.Xavier()
net[0].weight.initialize(init=xavier_init, force_reinit=True)
net[0].weight.data()[0]


[ 0.14208373  0.1430282  -0.00637825  0.07477359 -0.00076924 -0.14624824
  0.0411282  -0.06797681 -0.03875229 -0.02639443 -0.10707226 -0.02116564
  0.09498733 -0.0596132  -0.09145886 -0.02915448  0.00333779 -0.11186215
 -0.08129447  0.1417506 ]
<NDArray 20 @cpu(0)>

In [24]:
class MyInit(init.Initializer):
    """Custom initializer: uniform samples in [-10, 10) with small values zeroed.

    Each weight array is filled with uniform random samples, then every entry
    whose absolute value is below 5 is set to 0.
    """

    def _init_weight(self, name, data):
        """Fill `data` in place and log the parameter name and shape."""
        print('Init', name, data.shape)
        samples = nd.random.uniform(low=-10, high=10, shape=data.shape)
        data[:] = samples
        # Mask of entries to keep; multiplying by it zeroes entries with |value| < 5.
        keep_mask = data.abs() >= 5
        data *= keep_mask

# Re-initialize the whole network with the custom initializer, then show the
# first row of the first layer's weight.
custom_init = MyInit()
net.initialize(custom_init, force_reinit=True)
net[0].weight.data()[0]

Init dense4_weight (256, 20)

[[1. 0. 1. ... 1. 1. 0.]
 [0. 0. 1. ... 1. 1. 0.]
 [0. 0. 0. ... 0. 1. 1.]
 ...
 [0. 0. 1. ... 1. 1. 1.]
 [1. 1. 0. ... 1. 0. 0.]
 [0. 0. 1. ... 1. 0. 0.]]
<NDArray 256x20 @cpu(0)>
Init dense5_weight (10, 256)

[[1. 0. 0. ... 1. 1. 0.]
 [1. 0. 1. ... 1. 0. 1.]
 [1. 0. 0. ... 1. 1. 1.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 1.]
 [1. 0. 0. ... 0. 0. 1.]]
<NDArray 10x256 @cpu(0)>



[ 6.2993298 -0.         9.709829  -0.         9.379435   0.
  8.098967   7.9414024 -0.        -7.578802   9.840225  -5.169086
 -5.011599  -5.4347277 -7.881877  -0.         9.0190525  7.830454
 -5.331595  -0.       ]
<NDArray 20 @cpu(0)>

In [25]:
# Overwrite the weight directly: add 1 to every entry via set_data().
# Note that re-running this cell adds 1 again.
first_weight = net[0].weight
first_weight.set_data(first_weight.data() + 1)
first_weight.data()[0]


[ 7.2993298  1.        10.709829   1.        10.379435   1.
  9.098967   8.941402   1.        -6.578802  10.840225  -4.169086
 -4.011599  -4.4347277 -6.881877   1.        10.0190525  8.830454
 -4.331595   1.       ]
<NDArray 20 @cpu(0)>

In [27]:
net = nn.Sequential()
shared = nn.Dense(8, activation='relu')
# The second hidden layer (the `shared` variable) and the third hidden layer
# share model parameters: the third layer is built on shared.params.
first_hidden = nn.Dense(8, activation='relu')
third_hidden = nn.Dense(8, activation='relu', params=shared.params)
output_layer = nn.Dense(10)
net.add(first_hidden, shared, third_hidden, output_layer)
# Initialization and the forward pass are left disabled in this cell.
# net.initialize()

# X = nd.random.uniform(shape=(2, 20))
# net(X)

# net[1].weight.data()[0] == net[2].weight.data()[0]
# # During backpropagation, the gradients of the second and third hidden
# # layers are both accumulated into the gradient of the shared parameters.

In [33]:
# Show the four layers of the network as a tuple.
tuple(net[idx] for idx in range(4))

(Dense(None -> 8, Activation(relu)),
 Dense(None -> 8, Activation(relu)),
 Dense(None -> 8, Activation(relu)),
 Dense(None -> 10, linear))

In [34]:
net = nn.Sequential()
shared = nn.Dense(8, activation='relu')
# The second hidden layer (the `shared` variable) and the third hidden layer
# share model parameters: the third layer is built on shared.params.
first_hidden = nn.Dense(8, activation='relu')
third_hidden = nn.Dense(8, activation='relu', params=shared.params)
output_layer = nn.Dense(10)
net.add(first_hidden, shared, third_hidden, output_layer)
net.initialize()

X = nd.random.uniform(shape=(2, 20))
net(X)

# Compare the first weight row of the two tied layers element-wise; the
# result is 1 wherever the entries are equal.
# During backpropagation, the gradients of the second and third hidden layers
# are both accumulated into the gradient of the shared parameters.
second_layer_row = net[1].weight.data()[0]
third_layer_row = net[2].weight.data()[0]
second_layer_row == third_layer_row


[1. 1. 1. 1. 1. 1. 1. 1.]
<NDArray 8 @cpu(0)>