Permalink
Browse files

bug fix by @ducha-aiki

  • Loading branch information...
ChenglongChen committed Feb 19, 2015
1 parent 1d28ee6 commit 07033fd84e49b2645d33a4b087cba19291dd73c7
Showing with 592 additions and 579 deletions.
  1. +6 −0 README.md
  2. +52 −67 bn_layer.cpp
  3. +51 −69 bn_layer.cu
  4. +53 −15 caffe.proto
  5. +2 −2 common_layers.hpp
  6. +4 −2 layer_factory.cpp
  7. +424 −424 lenet_BN_sgd.log
View
@@ -3,3 +3,9 @@
This implementation of [Batch Normalization](http://arxiv.org/pdf/1502.03167v1.pdf) is based on [MVNLayer](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/mvn_layer.cpp) in Caffe.
To add this layer, you have to modify `common_layers.hpp`, `layer_factory.cpp` and `caffe.proto`. See the [Caffe wiki](https://github.com/BVLC/caffe/wiki/Development) for details.
# NOTE
This implementation is very basic: it only performs the batch normalization computation. Two pieces from [the paper](http://arxiv.org/pdf/1502.03167v1.pdf) are still missing here:
* fixed mean and variance for inference
* per-batch shuffling for thorough randomness
View
@@ -11,8 +11,6 @@ namespace caffe {
template <typename Dtype>
void BNLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
vector<Blob<Dtype>*>* top) {
(*top)[0]->Reshape(bottom[0]->num(), bottom[0]->channels(),
bottom[0]->height(), bottom[0]->width());
// Figure out the dimensions
N_ = bottom[0]->num();
@@ -21,6 +19,10 @@ namespace caffe {
W_ = bottom[0]->width();
var_eps_ = 1e-10;
// reshape blob
(*top)[0]->Reshape(N_, C_, H_, W_);
x_norm_.Reshape(N_, C_, H_, W_);
// mean
spatial_mean_.Reshape(N_, C_, 1, 1);
batch_mean_.Reshape(1, C_, 1, 1);
@@ -102,16 +104,20 @@ namespace caffe {
template <typename Dtype>
void BNLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
vector<Blob<Dtype>*>* top) {
const Dtype* bottom_data = bottom[0]->cpu_data();
Dtype* top_data = (*top)[0]->mutable_cpu_data();
const Dtype* const_bottom_data = bottom[0]->cpu_data();
const Dtype* const_top_data = (*top)[0]->cpu_data();
Dtype* top_data = (*top)[0]->mutable_cpu_data();
const Dtype* scale_data = this->blobs_[0]->cpu_data();
const Dtype* shift_data = this->blobs_[1]->cpu_data();
// put the squares of bottom into buffer_blob_
caffe_powx(bottom[0]->count(), bottom_data, Dtype(2),
caffe_powx(bottom[0]->count(), const_bottom_data, Dtype(2),
buffer_blob_.mutable_cpu_data());
// computes variance using var(X) = E(X^2) - (EX)^2
// EX across spatial
caffe_cpu_gemv<Dtype>(CblasNoTrans, N_ * C_, H_ * W_, Dtype(1. / (H_ * W_)), bottom_data,
caffe_cpu_gemv<Dtype>(CblasNoTrans, N_ * C_, H_ * W_, Dtype(1. / (H_ * W_)), const_bottom_data,
spatial_sum_multiplier_.cpu_data(), Dtype(0), spatial_mean_.mutable_cpu_data());
// EX across batch
caffe_cpu_gemv<Dtype>(CblasTrans, N_, C_, Dtype(1. / N_), spatial_mean_.cpu_data(),
@@ -138,7 +144,7 @@ namespace caffe {
spatial_mean_.cpu_data(), spatial_sum_multiplier_.cpu_data(), Dtype(0),
buffer_blob_.mutable_cpu_data());
caffe_add(buffer_blob_.count(), bottom_data, buffer_blob_.cpu_data(), top_data);
caffe_add(buffer_blob_.count(), const_bottom_data, buffer_blob_.cpu_data(), top_data);
// normalize variance
caffe_add_scalar(batch_variance_.count(), var_eps_, batch_variance_.mutable_cpu_data());
@@ -152,50 +158,73 @@ namespace caffe {
spatial_variance_.cpu_data(), spatial_sum_multiplier_.cpu_data(), Dtype(0),
buffer_blob_.mutable_cpu_data());
caffe_div(buffer_blob_.count(), top_data, buffer_blob_.cpu_data(), top_data);
caffe_div(buffer_blob_.count(), const_top_data, buffer_blob_.cpu_data(), top_data);
// save x_norm
caffe_copy(buffer_blob_.count(), const_top_data, x_norm_.mutable_cpu_data());
// scale
caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1),
batch_sum_multiplier_.cpu_data(), this->blobs_[0]->cpu_data(), Dtype(0),
batch_sum_multiplier_.cpu_data(), scale_data, Dtype(0),
spatial_variance_.mutable_cpu_data());
caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_ * C_, H_ * W_, 1, Dtype(1),
spatial_variance_.cpu_data(), spatial_sum_multiplier_.cpu_data(), Dtype(0),
buffer_blob_.mutable_cpu_data());
caffe_mul(buffer_blob_.count(), top_data, buffer_blob_.cpu_data(), top_data);
caffe_mul(buffer_blob_.count(), const_top_data, buffer_blob_.cpu_data(), top_data);
// shift
caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1),
batch_sum_multiplier_.cpu_data(), this->blobs_[1]->cpu_data(), Dtype(0),
batch_sum_multiplier_.cpu_data(), shift_data, Dtype(0),
spatial_mean_.mutable_cpu_data());
caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_ * C_, H_ * W_, 1, Dtype(1),
spatial_mean_.cpu_data(), spatial_sum_multiplier_.cpu_data(), Dtype(0),
buffer_blob_.mutable_cpu_data());
caffe_add(buffer_blob_.count(), top_data, buffer_blob_.cpu_data(), top_data);
caffe_add(buffer_blob_.count(), const_top_data, buffer_blob_.cpu_data(), top_data);
}
template <typename Dtype>
void BNLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down,
vector<Blob<Dtype>*>* bottom) {
const Dtype* top_diff = top[0]->cpu_diff();
const Dtype* top_data = top[0]->cpu_data();
const Dtype* bottom_data = (*bottom)[0]->cpu_data();
const Dtype* const_bottom_diff = (*bottom)[0]->cpu_diff();
Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff();
const Dtype* const_top_diff = top[0]->cpu_diff();
Dtype* scale_diff = this->blobs_[0]->mutable_cpu_diff();
Dtype* shift_diff = this->blobs_[1]->mutable_cpu_diff();
const Dtype* scale_data = this->blobs_[0]->cpu_data();
// gradient w.r.t. scale
caffe_mul(buffer_blob_.count(), x_norm_.cpu_data(), const_top_diff, buffer_blob_.mutable_cpu_data());
// EX across spatial
caffe_cpu_gemv<Dtype>(CblasNoTrans, N_ * C_, H_ * W_, Dtype(1), buffer_blob_.cpu_data(),
spatial_sum_multiplier_.cpu_data(), Dtype(0), spatial_variance_.mutable_cpu_data());
// EX across batch
caffe_cpu_gemv<Dtype>(CblasTrans, N_, C_, Dtype(1), spatial_variance_.cpu_data(),
batch_sum_multiplier_.cpu_data(), Dtype(0), scale_diff);
// gradient w.r.t. shift
// EX across spatial
caffe_cpu_gemv<Dtype>(CblasNoTrans, N_ * C_, H_ * W_, Dtype(1), const_top_diff,
spatial_sum_multiplier_.cpu_data(), Dtype(0), spatial_mean_.mutable_cpu_data());
// EX across batch
caffe_cpu_gemv<Dtype>(CblasTrans, N_, C_, Dtype(1), spatial_mean_.cpu_data(),
batch_sum_multiplier_.cpu_data(), Dtype(0), shift_diff);
// put scale * top_diff to buffer_blob_
caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1),
batch_sum_multiplier_.cpu_data(), this->blobs_[0]->cpu_data(), Dtype(0),
batch_sum_multiplier_.cpu_data(), scale_data, Dtype(0),
spatial_variance_.mutable_cpu_data());
caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_ * C_, H_ * W_, 1, Dtype(1),
spatial_variance_.cpu_data(), spatial_sum_multiplier_.cpu_data(), Dtype(0),
buffer_blob_.mutable_cpu_data());
caffe_mul(buffer_blob_.count(), top_diff, buffer_blob_.cpu_data(), buffer_blob_.mutable_cpu_data());
caffe_mul(buffer_blob_.count(), const_top_diff, buffer_blob_.cpu_data(), buffer_blob_.mutable_cpu_data());
// use new top diff for computation
caffe_mul(buffer_blob_.count(), top_data, buffer_blob_.cpu_data(), bottom_diff);
caffe_mul(buffer_blob_.count(), x_norm_.cpu_data(), buffer_blob_.cpu_data(), bottom_diff);
// EX across spatial
caffe_cpu_gemv<Dtype>(CblasNoTrans, N_ * C_, H_ * W_, Dtype(1), bottom_diff,
caffe_cpu_gemv<Dtype>(CblasNoTrans, N_ * C_, H_ * W_, Dtype(1), const_bottom_diff,
spatial_sum_multiplier_.cpu_data(), Dtype(0), spatial_mean_.mutable_cpu_data());
// EX across batch
caffe_cpu_gemv<Dtype>(CblasTrans, N_, C_, Dtype(1), spatial_mean_.cpu_data(),
@@ -208,7 +237,7 @@ namespace caffe {
spatial_mean_.cpu_data(), spatial_sum_multiplier_.cpu_data(), Dtype(0),
bottom_diff);
caffe_mul(buffer_blob_.count(), top_data, bottom_diff, bottom_diff);
caffe_mul(buffer_blob_.count(), x_norm_.cpu_data(), const_bottom_diff, bottom_diff);
// EX across spatial
caffe_cpu_gemv<Dtype>(CblasNoTrans, N_ * C_, H_ * W_, Dtype(1), buffer_blob_.cpu_data(),
@@ -226,61 +255,17 @@ namespace caffe {
caffe_cpu_axpby(buffer_blob_.count(), Dtype(1), buffer_blob_.cpu_data(), Dtype(-1. / (N_ * H_ * W_)),
bottom_diff);
// put the squares of bottom into buffer_blob_
caffe_powx(buffer_blob_.count(), bottom_data, Dtype(2),
buffer_blob_.mutable_cpu_data());
// computes variance using var(X) = E(X^2) - (EX)^2
// EX across spatial
caffe_cpu_gemv<Dtype>(CblasNoTrans, N_ * C_, H_ * W_, Dtype(1. / (H_ * W_)), bottom_data,
spatial_sum_multiplier_.cpu_data(), Dtype(0), spatial_mean_.mutable_cpu_data());
// EX across batch
caffe_cpu_gemv<Dtype>(CblasTrans, N_, C_, Dtype(1. / N_), spatial_mean_.cpu_data(),
batch_sum_multiplier_.cpu_data(), Dtype(0), batch_mean_.mutable_cpu_data());
// E(X^2) across spatial
caffe_cpu_gemv<Dtype>(CblasNoTrans, N_ * C_, H_ * W_, Dtype(1. / (H_ * W_)), buffer_blob_.cpu_data(),
spatial_sum_multiplier_.cpu_data(), Dtype(0), spatial_variance_.mutable_cpu_data());
// E(X^2) across batch
caffe_cpu_gemv<Dtype>(CblasTrans, N_, C_, Dtype(1. / N_), spatial_variance_.cpu_data(),
batch_sum_multiplier_.cpu_data(), Dtype(0), batch_variance_.mutable_cpu_data());
caffe_powx(batch_mean_.count(), batch_mean_.cpu_data(), Dtype(2),
buffer_blob_.mutable_cpu_data()); // (EX)^2
caffe_sub(batch_mean_.count(), batch_variance_.cpu_data(), buffer_blob_.cpu_data(),
batch_variance_.mutable_cpu_data()); // variance
// normalize variance
caffe_add_scalar(batch_variance_.count(), var_eps_, batch_variance_.mutable_cpu_data());
caffe_powx(batch_variance_.count(), batch_variance_.cpu_data(), Dtype(0.5),
batch_variance_.mutable_cpu_data());
// variance normalization
caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1),
batch_sum_multiplier_.cpu_data(), batch_variance_.cpu_data(), Dtype(0),
spatial_variance_.mutable_cpu_data());
caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, N_ * C_, H_ * W_, 1, Dtype(1),
spatial_variance_.cpu_data(), spatial_sum_multiplier_.cpu_data(), Dtype(0),
buffer_blob_.mutable_cpu_data());
caffe_div(buffer_blob_.count(), bottom_diff, buffer_blob_.cpu_data(), bottom_diff);
// gradient w.r.t. scale
caffe_mul(buffer_blob_.count(), top_data, top_diff, buffer_blob_.mutable_cpu_data());
// EX across spatial
caffe_cpu_gemv<Dtype>(CblasNoTrans, N_ * C_, H_ * W_, Dtype(1), buffer_blob_.cpu_data(),
spatial_sum_multiplier_.cpu_data(), Dtype(0), spatial_variance_.mutable_cpu_data());
// EX across batch
caffe_cpu_gemv<Dtype>(CblasTrans, N_, C_, Dtype(1), spatial_variance_.cpu_data(),
batch_sum_multiplier_.cpu_data(), Dtype(0), this->blobs_[0]->mutable_cpu_diff());
caffe_div(buffer_blob_.count(), const_bottom_diff, buffer_blob_.cpu_data(), bottom_diff);
// gradient w.r.t. shift
// EX across spatial
caffe_cpu_gemv<Dtype>(CblasNoTrans, N_ * C_, H_ * W_, Dtype(1), top_diff,
spatial_sum_multiplier_.cpu_data(), Dtype(0), spatial_mean_.mutable_cpu_data());
// EX across batch
caffe_cpu_gemv<Dtype>(CblasTrans, N_, C_, Dtype(1), spatial_mean_.cpu_data(),
batch_sum_multiplier_.cpu_data(), Dtype(0), this->blobs_[1]->mutable_cpu_diff());
}
#ifdef CPU_ONLY
Oops, something went wrong.

0 comments on commit 07033fd

Please sign in to comment.