-
Notifications
You must be signed in to change notification settings - Fork 18.7k
/
scale_layer.cu
135 lines (127 loc) · 5.5 KB
/
scale_layer.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#include <cfloat>
#include <vector>
#include "caffe/layers/scale_layer.hpp"
#include "caffe/util/math_functions.hpp"
namespace caffe {
template <typename Dtype>
__global__ void ScaleForward(const int n, const Dtype* in,
    const Dtype* scale, const int scale_dim, const int inner_dim,
    Dtype* out) {
  // Elementwise product with a broadcast scale factor: each flat element
  // index maps to one of scale_dim coefficients, repeating every inner_dim
  // elements. CUDA_KERNEL_LOOP is Caffe's grid-stride loop over [0, n).
  // NOTE: in and out may alias (in-place Forward), which is safe here
  // because each thread reads and writes only its own element.
  CUDA_KERNEL_LOOP(index, n) {
    const int coeff = (index / inner_dim) % scale_dim;
    out[index] = scale[coeff] * in[index];
  }
}
template <typename Dtype>
__global__ void ScaleBiasForward(const int n, const Dtype* in,
    const Dtype* scale, const Dtype* bias,
    const int scale_dim, const int inner_dim, Dtype* out) {
  // Fused scale-and-shift: out = in * scale + bias, where scale and bias
  // are broadcast along the same axis (one coefficient per inner_dim run,
  // cycling through scale_dim values). CUDA_KERNEL_LOOP is Caffe's
  // grid-stride loop over [0, n); in and out may alias safely since each
  // thread touches only its own element.
  CUDA_KERNEL_LOOP(index, n) {
    const int coeff = (index / inner_dim) % scale_dim;
    out[index] = in[index] * scale[coeff] + bias[coeff];
  }
}
template <typename Dtype>
void ScaleLayer<Dtype>::Forward_gpu(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  // Total number of elements to scale; top and bottom have equal counts.
  const int count = top[0]->count();
  const Dtype* bottom_data = bottom[0]->gpu_data();
  const bool in_place = (bottom[0] == top[0]);
  if (in_place) {
    // In-place computation: snapshot the original bottom data into temp_
    // before the kernel overwrites it. Backward needs the pre-scale values;
    // Caffe provides no way to know at Forward time whether Backward will
    // run, so the copy is made unconditionally.
    caffe_copy(bottom[0]->count(), bottom[0]->gpu_data(),
               temp_.mutable_gpu_data());
  }
  // Scale coefficients come from a second bottom blob when one is supplied,
  // otherwise from this layer's own learned parameter blob.
  Blob<Dtype>* scale_blob =
      (bottom.size() > 1) ? bottom[1] : this->blobs_[0].get();
  const Dtype* scale_data = scale_blob->gpu_data();
  Dtype* top_data = top[0]->mutable_gpu_data();
  if (bias_layer_) {
    // A bias term is configured: launch the fused scale+bias kernel.
    const Dtype* bias_data = this->blobs_[bias_param_id_]->gpu_data();
    ScaleBiasForward<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
        <<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
        count, bottom_data, scale_data, bias_data, scale_dim_, inner_dim_,
        top_data);
  } else {
    ScaleForward<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
        <<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
        count, bottom_data, scale_data, scale_dim_, inner_dim_, top_data);
  }
}
template <typename Dtype>
void ScaleLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  // Backward through the internal bias layer first, if one is configured
  // (the bias parameter is always the last entry in param_propagate_down_).
  if (bias_layer_ &&
      this->param_propagate_down_[this->param_propagate_down_.size() - 1]) {
    bias_layer_->Backward(top, bias_propagate_down_, bias_bottom_vec_);
  }
  // scale_param == true means the scale lives in this->blobs_[0] (a learned
  // parameter); otherwise it is supplied as a second bottom blob.
  const bool scale_param = (bottom.size() == 1);
  Blob<Dtype>* scale = scale_param ? this->blobs_[0].get() : bottom[1];
  if ((!scale_param && propagate_down[1]) ||
      (scale_param && this->param_propagate_down_[0])) {
    const Dtype* top_diff = top[0]->gpu_diff();
    const bool in_place = (bottom[0] == top[0]);
    // If Forward ran in place, the original bottom data was saved in temp_.
    const Dtype* bottom_data = (in_place ? &temp_ : bottom[0])->gpu_data();
    // Hack: store big eltwise product in bottom[0] diff, except in the special
    // case where this layer itself does the eltwise product, in which case we
    // can store it directly in the scale diff, and we're done.
    // If we're computing in-place (and not doing eltwise computation), this
    // hack doesn't work and we store the product in temp_.
    const bool is_eltwise = (bottom[0]->count() == scale->count());
    Dtype* product = (is_eltwise ? scale->mutable_gpu_diff() :
        (in_place ? temp_.mutable_gpu_data() : bottom[0]->mutable_gpu_diff()));
    // product[i] = top_diff[i] * bottom_data[i]; the scale gradient is then
    // this product summed over all axes the scale is broadcast across.
    caffe_gpu_mul(top[0]->count(), top_diff, bottom_data, product);
    if (!is_eltwise) {
      Dtype* sum_result = NULL;
      if (inner_dim_ == 1) {
        // No inner axis to reduce; the product itself is the partial sum.
        sum_result = product;
      } else if (sum_result_.count() == 1) {
        // Scale is a single scalar: reduce the whole product with one dot.
        const Dtype* sum_mult = sum_multiplier_.gpu_data();
        Dtype* scale_diff = scale->mutable_cpu_diff();
        if (scale_param) {
          // Parameter gradients accumulate across iterations: add, don't
          // overwrite.
          Dtype result;
          caffe_gpu_dot(inner_dim_, product, sum_mult, &result);
          *scale_diff += result;
        } else {
          // Bottom-blob gradients are overwritten each pass; write directly
          // (caffe_gpu_dot returns its result into a host pointer).
          caffe_gpu_dot(inner_dim_, product, sum_mult, scale_diff);
        }
      } else {
        // Reduce over the inner dimension via a matrix-vector product with a
        // ones vector: sum_result has outer_dim_ * scale_dim_ entries.
        const Dtype* sum_mult = sum_multiplier_.gpu_data();
        // If there is no outer axis, this reduction already yields the final
        // scale gradient, so write it straight into the scale diff.
        sum_result = (outer_dim_ == 1) ?
            scale->mutable_gpu_diff() : sum_result_.mutable_gpu_data();
        caffe_gpu_gemv(CblasNoTrans, sum_result_.count(), inner_dim_,
                       Dtype(1), product, sum_mult, Dtype(0), sum_result);
      }
      if (outer_dim_ != 1) {
        // Still need to reduce over the outer dimension.
        const Dtype* sum_mult = sum_multiplier_.gpu_data();
        if (scale_dim_ == 1) {
          // Scalar scale: one dot product collapses the outer axis.
          Dtype* scale_diff = scale->mutable_cpu_diff();
          if (scale_param) {
            // Accumulate into the parameter gradient.
            Dtype result;
            caffe_gpu_dot(outer_dim_, sum_mult, sum_result, &result);
            *scale_diff += result;
          } else {
            caffe_gpu_dot(outer_dim_, sum_mult, sum_result, scale_diff);
          }
        } else {
          // gemv with beta = Dtype(scale_param): beta==1 accumulates into an
          // existing parameter gradient, beta==0 overwrites a bottom diff.
          Dtype* scale_diff = scale->mutable_gpu_diff();
          caffe_gpu_gemv(CblasTrans, outer_dim_, scale_dim_,
                         Dtype(1), sum_result, sum_mult, Dtype(scale_param),
                         scale_diff);
        }
      }
    }
  }
  if (propagate_down[0]) {
    // d(bottom) = d(top) * scale: the same broadcast-multiply as Forward,
    // so the forward kernel is reused on the diffs.
    const int count = top[0]->count();
    const Dtype* top_diff = top[0]->gpu_diff();
    const Dtype* scale_data = scale->gpu_data();
    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
    ScaleForward<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
        <<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
        count, top_diff, scale_data, scale_dim_, inner_dim_, bottom_diff);
  }
}
INSTANTIATE_LAYER_GPU_FUNCS(ScaleLayer);
} // namespace caffe