In [1]:
from mxnet import init,nd
from mxnet.gluon import nn

# Two-layer perceptron: a 256-unit ReLU hidden layer followed by a 10-unit output layer.
net = nn.Sequential()
net.add(nn.Dense(256, activation='relu'),
        nn.Dense(10))
# Default initialization: weights are drawn uniformly from [-0.07, 0.07],
# i.e. init.Uniform(scale=0.07).
net.initialize()

X = nd.random.uniform(shape=(2, 20))
Y = net(X)  # forward pass


In [2]:
# 4.2.1 Accessing model parameters
# The params attribute of a Block gives access to all parameters contained in that layer.
net[0].params,type(net[0].params)

(dense0_ (
   Parameter dense0_weight (shape=(256, 20), dtype=float32)
   Parameter dense0_bias (shape=(256,), dtype=float32)
 ),
 mxnet.gluon.parameter.ParameterDict)

In [3]:
# Fetch the first layer's weight Parameter two ways: by name from its params dict
# and through the layer's weight attribute.
net[0].params['dense0_weight'],net[0].weight

(Parameter dense0_weight (shape=(256, 20), dtype=float32),
 Parameter dense0_weight (shape=(256, 20), dtype=float32))

In [4]:
# In Gluon a parameter is an instance of the Parameter class, which holds both the parameter
# values and their gradients; they are read with the data() and grad() methods respectively.
net[0].weight.data()


[[ 0.06700657 -0.00369488  0.0418822  ... -0.05517294 -0.01194733
  -0.00369594]
 [-0.03296221 -0.04391347  0.03839272 ...  0.05636378  0.02545484
  -0.007007  ]
 [-0.0196689   0.01582889 -0.00881553 ...  0.01509629 -0.01908049
  -0.02449339]
 ...
 [ 0.00010955  0.0439323  -0.04911506 ...  0.06975312  0.0449558
  -0.03283203]
 [ 0.04106557  0.05671307 -0.00066976 ...  0.06387014 -0.01292654
   0.00974177]
 [ 0.00297424 -0.0281784  -0.06881659 ... -0.04047417  0.00457048
   0.05696651]]
<NDArray 256x20 @cpu(0)>

In [5]:
# The weight gradient has the same shape as the weight itself.
# No backward() call appears in the preceding cells, so no gradient has been computed yet.
net[0].weight.grad()


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
<NDArray 256x20 @cpu(0)>

In [6]:
# Values of the bias parameter of the second (output) layer.
net[1].bias.data()


[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
<NDArray 10 @cpu(0)>

In [7]:
# collect_params gathers the parameters of every layer nested inside net (for example layers
# nested through add). Like params, it returns a dictionary that maps parameter names to
# Parameter instances.
net.collect_params(),type(net.collect_params())

(sequential0_ (
   Parameter dense0_weight (shape=(256, 20), dtype=float32)
   Parameter dense0_bias (shape=(256,), dtype=float32)
   Parameter dense1_weight (shape=(10, 256), dtype=float32)
   Parameter dense1_bias (shape=(10,), dtype=float32)
 ),
 mxnet.gluon.parameter.ParameterDict)

In [8]:
# collect_params accepts a regular expression that is matched against parameter names,
# which selects only the parameters of interest (here: names ending in "weight").
net.collect_params('.*weight')

sequential0_ (
  Parameter dense0_weight (shape=(256, 20), dtype=float32)
  Parameter dense1_weight (shape=(10, 256), dtype=float32)
)

In [9]:
# 4.2.2 Initializing model parameters
# init.Normal(sigma=0.01) initializes the weights with normally distributed random numbers
# (mean 0, standard deviation 0.01); the bias parameters are still reset to zero.
# Re-initializing an already initialized model requires force_reinit=True, which forces the
# re-initialization (the default is False).
net.initialize(init=init.Normal(sigma=0.01),force_reinit=True)
net[0].weight.data()[0]


[ 0.00195949 -0.0173764   0.00047347  0.00145809  0.00326049  0.00457878
 -0.00894258  0.00493839 -0.00904343 -0.01214079  0.02156406  0.01093822
  0.01827143 -0.0104467   0.01006219  0.0051742  -0.00806932  0.01376901
  0.00205885  0.00994352]
<NDArray 20 @cpu(0)>

In [10]:
# init.Constant initializes the weight parameters with a constant value (1 here).
net.initialize(init=init.Constant(1),force_reinit=True)
net[0].weight.data()[0]


[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
<NDArray 20 @cpu(0)>

In [11]:
# net[0].weight is an instance of the Parameter class.
# To initialize one specific parameter only, call the initialize method of Parameter; it is
# used the same way as the initialize method provided by Block.
net[0].weight.initialize(init=init.Xavier(),force_reinit=True)
net[0].weight.data()[0]


[ 0.00512482 -0.06579044 -0.10849719 -0.09586414  0.06394844  0.06029618
 -0.03065033 -0.01086642  0.01929168  0.1003869  -0.09339568 -0.08703034
 -0.10472868 -0.09879824 -0.00352201 -0.11063069 -0.04257748  0.06548801
  0.12987629 -0.13846186]
<NDArray 20 @cpu(0)>

In [12]:
# 4.2.3 Custom initialization
# init.Uniform, init.Normal and init.Xavier are all subclasses of init.Initializer.
class MyInit(init.Initializer):
    """Initializer that fills weights with uniform samples from [-10, 10) and zeroes small ones.

    Every sampled value whose absolute value is below 5 is replaced by zero, so the
    resulting weights are either 0 or have magnitude of at least 5.
    """

    def _init_weight(self, name, data):
        """Overwrite `data` in place; `name` is only used for the progress message."""
        print('Init', name, data.shape)
        samples = nd.random.uniform(low=-10, high=10, shape=data.shape)
        data[:] = samples
        # The comparison yields a 0/1 mask; multiplying by it zeroes entries with |value| < 5.
        keep_mask = data.abs() >= 5
        data *= keep_mask


net.initialize(init=MyInit(), force_reinit=True)
net[0].weight.data()[0]

Init dense0_weight (256, 20)
Init dense1_weight (10, 256)



[-5.3659673  7.5773945  8.986376  -0.         8.827555   0.
  5.9840508 -0.         0.         0.         7.4857597 -0.
 -0.         6.8910007  6.9788704 -6.1131554  0.         5.4665203
 -9.735263   9.485172 ]
<NDArray 20 @cpu(0)>

In [13]:
# The set_data method of the Parameter class overwrites a model parameter directly.
# NOTE: the new value is derived from the current one (weight + 1), so every re-run of this
# cell adds 1 again.
net[0].weight.set_data(net[0].weight.data()+1)
net[0].weight.data()[0]


[-4.3659673  8.5773945  9.986376   1.         9.827555   1.
  6.9840508  1.         1.         1.         8.48576    1.
  1.         7.8910007  7.9788704 -5.1131554  1.         6.4665203
 -8.735263  10.485172 ]
<NDArray 20 @cpu(0)>

In [14]:
# 4.2.4 Sharing model parameters
# The third hidden layer is constructed with params=shared.params, so it uses the parameters
# of the second hidden layer (the `shared` variable): both layers share one set of parameters.
# Because a parameter also carries its gradient, during backpropagation the gradients of the
# second and the third hidden layer are both accumulated into the shared parameters' gradient.
net = nn.Sequential()
shared = nn.Dense(8, activation='relu')
net.add(nn.Dense(8, activation='relu'),
        shared,
        nn.Dense(8, activation='relu', params=shared.params),
        nn.Dense(10))
net.initialize()
X = nd.random.uniform(shape=(2, 20))
net(X)  # forward pass; its result is not needed here

# Print the comparison explicitly: a notebook only displays the last expression of a cell,
# so a bare `==` expression at this position would be evaluated and silently dropped.
print(net[1].weight.data()[0] == net[2].weight.data()[0])
# Gradient buffer of the first layer's weight (no backward pass has been run on this network).
net[0].weight.grad()


[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
<NDArray 8x20 @cpu(0)>

In [15]:
# Exercise 1: access the model parameters after net.initialize() but before net(X) and
# observe their shapes.
# Accessed at that point, the weight reports shape (1, 0).
# This shows that the weight is actually initialized during the first forward pass
# (i.e. by net(X)); net.initialize() on its own does not initialize it yet.
net=nn.Sequential()
net.add(nn.Dense(1))
net[0].weight  # not the cell's last expression, so the notebook does not display this value
net.initialize()
net[0].weight

Parameter dense6_weight (shape=(1, 0), dtype=float32)

In [16]:
# Exercise 2: build a multilayer perceptron that contains a parameter-sharing layer and
# train it. During training, observe the parameters and gradients of each layer.
import d2lzh as d2l
from mxnet import gluon,init,autograd
from mxnet.gluon import loss as gloss,nn

# The third hidden layer is built with params=shared.params, so it shares its parameters
# with the second hidden layer (`shared`).
net = nn.Sequential()
shared = nn.Dense(8, activation='relu')
net.add(nn.Dense(8, activation='relu'),
        shared,
        nn.Dense(8, activation='relu', params=shared.params),
        nn.Dense(10))
net.initialize(init.Normal(sigma=0.01))

batch_size, num_epochs = 256, 5
loss = gloss.SoftmaxCrossEntropyLoss()
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.5})

# Plain Gluon training loop. The former `trainer is None` fallback was removed: `trainer` is
# always set above and the fallback called `sgd(params, lr, ...)`, none of which is defined
# in this notebook. The loss/accuracy accumulators and the per-epoch test accuracy were
# dropped as well because their values were never read or printed.
for epoch in range(num_epochs):
    for X, y in train_iter:
        with autograd.record():
            y_hat = net(X)
            l = loss(y_hat, y).sum()
        l.backward()
        trainer.step(batch_size)
    # After every epoch, check element-wise that the two sharing layers still hold
    # identical weights (1 means equal).
    print(net[1].weight.data() == net[2].weight.data())



[[1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]]
<NDArray 8x8 @cpu(0)>

[[1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]]
<NDArray 8x8 @cpu(0)>

[[1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]]
<NDArray 8x8 @cpu(0)>

[[1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]]
<NDArray 8x8 @cpu(0)>

[[1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1

In [17]:
import mxnet


# IPython source-inspection queries (`obj??`) for the MXNet objects used in this notebook.
# This is IPython syntax, not plain Python, so the cell only runs inside IPython/Jupyter.
# NOTE(review): `net.initialize??` is queried twice below.
mxnet.init??




net.initialize??
init.Constant??
mxnet.gluon.parameter.Parameter.initialize??
net.initialize??

init.Xavier??
init.Initializer??
mxnet.gluon.parameter.Parameter??

nn.HybridBlock??
nn.Dense??
gluon.Trainer??
init.Uniform??