Yet another batch normalization PR #3229
Merged
2 commits
@@ -0,0 +1,28 @@
+# reduce learning rate after 120 epochs (60000 iters) by factor of 10
+# then another factor of 10 after 10 more epochs (5000 iters)
+
+# The train/test net protocol buffer definition
+net: "examples/cifar10/cifar10_full_sigmoid_train_test.prototxt"
+# test_iter specifies how many forward passes the test should carry out.
+# In the case of CIFAR10, we have test batch size 1000 and 10 test iterations,
+# covering the full 10,000 testing images.
+test_iter: 10
+# Carry out testing every 1000 training iterations.
+test_interval: 1000
+# The base learning rate, momentum and the weight decay of the network.
+base_lr: 0.001
+momentum: 0.9
+#weight_decay: 0.004
+# The learning rate policy
+lr_policy: "step"
+gamma: 1
+stepsize: 5000
+# Display every 100 iterations
+display: 100
+# The maximum number of iterations
+max_iter: 60000
+# snapshot intermediate results
+snapshot: 10000
+snapshot_prefix: "examples/cifar10_full_sigmoid"
+# solver mode: CPU or GPU
+solver_mode: GPU
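A note on the schedule above: Caffe's "step" policy sets the rate to

    lr(iter) = base_lr * gamma ^ floor(iter / stepsize)

so with gamma: 1 the learning rate in fact stays at base_lr = 0.001 for all 60000 iterations; the drop-by-10 described in the header comment would correspond to gamma: 0.1.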
@@ -0,0 +1,28 @@
+# reduce learning rate after 120 epochs (60000 iters) by factor of 10
+# then another factor of 10 after 10 more epochs (5000 iters)
+
+# The train/test net protocol buffer definition
+net: "examples/cifar10/cifar10_full_sigmoid_train_test_bn.prototxt"
+# test_iter specifies how many forward passes the test should carry out.
+# In the case of CIFAR10, we have test batch size 1000 and 10 test iterations,
+# covering the full 10,000 testing images.
+test_iter: 10
+# Carry out testing every 1000 training iterations.
+test_interval: 1000
+# The base learning rate, momentum and the weight decay of the network.
+base_lr: 0.001
+momentum: 0.9
+#weight_decay: 0.004
+# The learning rate policy
+lr_policy: "step"
+gamma: 1
+stepsize: 5000
+# Display every 100 iterations
+display: 100
+# The maximum number of iterations
+max_iter: 60000
+# snapshot intermediate results
+snapshot: 10000
+snapshot_prefix: "examples/cifar10_full_sigmoid_bn"
+# solver mode: CPU or GPU
+solver_mode: GPU
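The _bn train/test net referenced by this solver is not part of the excerpt below. As a minimal sketch of the pattern (the names bn1, pool1, Sigmoid1 are assumed to mirror the non-BN net that follows), a Caffe BatchNorm layer would sit between a layer and its sigmoid nonlinearity:

layer {
  name: "bn1"
  type: "BatchNorm"
  bottom: "pool1"
  top: "bn1"
  # BatchNorm's mean/variance/scale-factor blobs are updated during the
  # forward pass, not by the solver, so all three get lr_mult: 0
  param { lr_mult: 0 }
  param { lr_mult: 0 }
  param { lr_mult: 0 }
}
layer {
  name: "Sigmoid1"
  type: "Sigmoid"
  bottom: "bn1"
  top: "Sigmoid1"
}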
@@ -0,0 +1,212 @@
+name: "CIFAR10_full"
+layer {
+  name: "cifar"
+  type: "Data"
+  top: "data"
+  top: "label"
+  include {
+    phase: TRAIN
+  }
+  transform_param {
+    mean_file: "examples/cifar10/mean.binaryproto"
+  }
+  data_param {
+    source: "examples/cifar10/cifar10_train_lmdb"
+    batch_size: 111
+    backend: LMDB
+  }
+}
+layer {
+  name: "cifar"
+  type: "Data"
+  top: "data"
+  top: "label"
+  include {
+    phase: TEST
+  }
+  transform_param {
+    mean_file: "examples/cifar10/mean.binaryproto"
+  }
+  data_param {
+    source: "examples/cifar10/cifar10_test_lmdb"
+    batch_size: 1000
+    backend: LMDB
+  }
+}
+layer {
+  name: "conv1"
+  type: "Convolution"
+  bottom: "data"
+  top: "conv1"
+  param {
+    lr_mult: 1
+  }
+  param {
+    lr_mult: 2
+  }
+  convolution_param {
+    num_output: 32
+    pad: 2
+    kernel_size: 5
+    stride: 1
+    weight_filler {
+      type: "gaussian"
+      std: 0.0001
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layer {
+  name: "pool1"
+  type: "Pooling"
+  bottom: "conv1"
+  top: "pool1"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+
+layer {
+  name: "Sigmoid1"
+  type: "Sigmoid"
+  bottom: "pool1"
+  top: "Sigmoid1"
+}
+
+layer {
+  name: "conv2"
+  type: "Convolution"
+  bottom: "Sigmoid1"
+  top: "conv2"
+  param {
+    lr_mult: 1
+  }
+  param {
+    lr_mult: 2
+  }
+  convolution_param {
+    num_output: 32
+    pad: 2
+    kernel_size: 5
+    stride: 1
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+
+layer {
+  name: "Sigmoid2"
+  type: "Sigmoid"
+  bottom: "conv2"
+  top: "Sigmoid2"
+}
+layer {
+  name: "pool2"
+  type: "Pooling"
+  bottom: "Sigmoid2"
+  top: "pool2"
+  pooling_param {
+    pool: AVE
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "conv3"
+  type: "Convolution"
+  bottom: "pool2"
+  top: "conv3"
+  convolution_param {
+    num_output: 64
+    pad: 2
+    kernel_size: 5
+    stride: 1
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+  param {
+    lr_mult: 1
+  }
+  param {
+    lr_mult: 1
+  }
+}
+
+layer {
+  name: "Sigmoid3"
+  type: "Sigmoid"
+  bottom: "conv3"
+  top: "Sigmoid3"
+}
+
+layer {
+  name: "pool3"
+  type: "Pooling"
+  bottom: "Sigmoid3"
+  top: "pool3"
+  pooling_param {
+    pool: AVE
+    kernel_size: 3
+    stride: 2
+  }
+}
+
+layer {
+  name: "ip1"
+  type: "InnerProduct"
+  bottom: "pool3"
+  top: "ip1"
+  param {
+    lr_mult: 1
+    decay_mult: 0
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 10
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+    }
+  }
+}
+layer {
+  name: "accuracy"
+  type: "Accuracy"
+  bottom: "ip1"
+  bottom: "label"
+  top: "accuracy"
+  include {
+    phase: TEST
+  }
+}
+layer {
+  name: "loss"
+  type: "SoftmaxWithLoss"
+  bottom: "ip1"
+  bottom: "label"
+  top: "loss"
+}
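For reference, the computation defined above runs data -> conv1 -> pool1 (MAX) -> Sigmoid1 -> conv2 -> Sigmoid2 -> pool2 (AVE) -> conv3 -> Sigmoid3 -> pool3 (AVE) -> ip1, with SoftmaxWithLoss (and, at test time, Accuracy) on top; note the nonlinearity comes after pooling in the first block but before pooling in the second, which prompts the review comment below.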
ducha-aiki (Contributor): Somewhat unrelated to batch normalization, but is it intentional to use conv -> pooling -> sigmoid in the first layer and conv -> sigmoid -> pooling in the second layer?