@@ -25,7 +25,14 @@ void SoftmaxWithLossLayer<Dtype>::LayerSetUp(
if (has_ignore_label_) {
ignore_label_ = this ->layer_param_.loss_param ().ignore_label ();
}
- normalize_ = this ->layer_param_.loss_param ().normalize ();
+ if (!this ->layer_param_.loss_param ().has_normalization () &&
+ this ->layer_param_.loss_param ().has_normalize ()) {
+ normalization_ = this ->layer_param_.loss_param ().normalize () ?
+ LossParameter_NormalizationMode_VALID :
+ LossParameter_NormalizationMode_BATCH_SIZE;
+ } else {
+ normalization_ = this ->layer_param_.loss_param ().normalization ();
+ }
}
template <typename Dtype>
@@ -49,6 +56,36 @@ void SoftmaxWithLossLayer<Dtype>::Reshape(
}
template <typename Dtype>
+// Returns the scalar divisor applied to the accumulated loss (and to the
+// gradient scale in Backward), chosen by the configured normalization mode.
+// valid_count is the number of samples that actually contributed to the loss
+// (i.e. not carrying the ignore label); a valid_count of -1 signals that the
+// caller did not track a count, in which case VALID falls back to FULL
+// normalization. NOTE(review): callers visible in this patch always pass a
+// real count — confirm where -1 originates (presumably the GPU path or a
+// caller without an ignore label).
+Dtype SoftmaxWithLossLayer<Dtype>::get_normalizer(
+ LossParameter_NormalizationMode normalization_mode, int valid_count) {
+ Dtype normalizer;
+ switch (normalization_mode) {
+ case LossParameter_NormalizationMode_FULL:
+ // Normalize by every spatial position in the batch, valid or not.
+ normalizer = Dtype (outer_num_ * inner_num_);
+ break ;
+ case LossParameter_NormalizationMode_VALID:
+ if (valid_count == -1 ) {
+ // No count available: behave like FULL rather than guessing.
+ normalizer = Dtype (outer_num_ * inner_num_);
+ } else {
+ normalizer = Dtype (valid_count);
+ }
+ break ;
+ case LossParameter_NormalizationMode_BATCH_SIZE:
+ // Normalize by the number of examples only (legacy normalize=false).
+ normalizer = Dtype (outer_num_);
+ break ;
+ case LossParameter_NormalizationMode_NONE:
+ // Leave the summed loss unscaled.
+ normalizer = Dtype (1 );
+ break ;
+ default :
+ // Unreachable for well-formed protos; abort loudly on a bad enum value.
+ LOG (FATAL) << " Unknown normalization mode: "
+ << LossParameter_NormalizationMode_Name (normalization_mode);
+ }
+ // Some users will have no labels for some examples in order to 'turn off' a
+ // particular loss in a multi-task setup. The max prevents NaNs in that case.
+ // (This clamp is also what guards the loss / normalizer and
+ // loss_weight / normalizer divisions against divide-by-zero.)
+ return std::max (Dtype (1.0 ), normalizer);
+}
+
+template <typename Dtype>
void SoftmaxWithLossLayer<Dtype>::Forward_cpu(
const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
// The forward pass computes the softmax prob values.
@@ -71,11 +108,7 @@ void SoftmaxWithLossLayer<Dtype>::Forward_cpu(
++count;
}
}
- if (normalize_) {
- top[0 ]->mutable_cpu_data ()[0 ] = loss / count;
- } else {
- top[0 ]->mutable_cpu_data ()[0 ] = loss / outer_num_;
- }
+ top[0 ]->mutable_cpu_data ()[0 ] = loss / get_normalizer (normalization_, count);
if (top.size () == 2 ) {
top[1 ]->ShareData (prob_);
}
@@ -109,12 +142,9 @@ void SoftmaxWithLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
}
}
// Scale gradient
- const Dtype loss_weight = top[0 ]->cpu_diff ()[0 ];
- if (normalize_) {
- caffe_scal (prob_.count (), loss_weight / count, bottom_diff);
- } else {
- caffe_scal (prob_.count (), loss_weight / outer_num_, bottom_diff);
- }
+ Dtype loss_weight = top[0 ]->cpu_diff ()[0 ] /
+ get_normalizer (normalization_, count);
+ caffe_scal (prob_.count (), loss_weight, bottom_diff);
}
}
Potential divide-by-zero here (and at the other sites that divide by the normalizer, e.g. the Forward and Backward scaling).
Yes, the old code had the same problem, but it might as well get fixed here, since a separate PR fixing the divide-by-zero would conflict with this one. The `std::max(Dtype(1.0), normalizer)` clamp in `get_normalizer` addresses it for all of these call sites.