Groomed version of @fyu's dilated convolution #3487

Merged
6 commits merged on Dec 28, 2015 (source/target branch names not captured)
@@ -68,6 +68,8 @@ class BaseConvolutionLayer : public Layer<Dtype> {
Blob<int> stride_;
/// @brief The spatial dimensions of the padding.
Blob<int> pad_;
+ /// @brief The spatial dimensions of the dilation.
+ Blob<int> dilation_;
/// @brief The spatial dimensions of the convolution input.
Blob<int> conv_input_shape_;
/// @brief The spatial dimensions of the col_buffer.
@@ -99,11 +101,12 @@ class BaseConvolutionLayer : public Layer<Dtype> {
conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2],
kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
pad_.cpu_data()[0], pad_.cpu_data()[1],
- stride_.cpu_data()[0], stride_.cpu_data()[1], col_buff);
+ stride_.cpu_data()[0], stride_.cpu_data()[1],
+ dilation_.cpu_data()[0], dilation_.cpu_data()[1], col_buff);
} else {
im2col_nd_cpu(data, num_spatial_axes_, conv_input_shape_.cpu_data(),
col_buffer_shape_.data(), kernel_shape_.cpu_data(),
- pad_.cpu_data(), stride_.cpu_data(), col_buff);
+ pad_.cpu_data(), stride_.cpu_data(), dilation_.cpu_data(), col_buff);
}
}
inline void conv_col2im_cpu(const Dtype* col_buff, Dtype* data) {
@@ -112,11 +115,12 @@ class BaseConvolutionLayer : public Layer<Dtype> {
conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2],
kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
pad_.cpu_data()[0], pad_.cpu_data()[1],
- stride_.cpu_data()[0], stride_.cpu_data()[1], data);
+ stride_.cpu_data()[0], stride_.cpu_data()[1],
+ dilation_.cpu_data()[0], dilation_.cpu_data()[1], data);
} else {
col2im_nd_cpu(col_buff, num_spatial_axes_, conv_input_shape_.cpu_data(),
col_buffer_shape_.data(), kernel_shape_.cpu_data(),
- pad_.cpu_data(), stride_.cpu_data(), data);
+ pad_.cpu_data(), stride_.cpu_data(), dilation_.cpu_data(), data);
}
}
#ifndef CPU_ONLY
@@ -126,12 +130,13 @@ class BaseConvolutionLayer : public Layer<Dtype> {
conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2],
kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
pad_.cpu_data()[0], pad_.cpu_data()[1],
- stride_.cpu_data()[0], stride_.cpu_data()[1], col_buff);
+ stride_.cpu_data()[0], stride_.cpu_data()[1],
+ dilation_.cpu_data()[0], dilation_.cpu_data()[1], col_buff);
} else {
im2col_nd_gpu(data, num_spatial_axes_, num_kernels_im2col_,
conv_input_shape_.gpu_data(), col_buffer_.gpu_shape(),
kernel_shape_.gpu_data(), pad_.gpu_data(),
- stride_.gpu_data(), col_buff);
+ stride_.gpu_data(), dilation_.gpu_data(), col_buff);
}
}
inline void conv_col2im_gpu(const Dtype* col_buff, Dtype* data) {
@@ -140,12 +145,13 @@ class BaseConvolutionLayer : public Layer<Dtype> {
conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2],
kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
pad_.cpu_data()[0], pad_.cpu_data()[1],
- stride_.cpu_data()[0], stride_.cpu_data()[1], data);
+ stride_.cpu_data()[0], stride_.cpu_data()[1],
+ dilation_.cpu_data()[0], dilation_.cpu_data()[1], data);
} else {
col2im_nd_gpu(col_buff, num_spatial_axes_, num_kernels_col2im_,
conv_input_shape_.gpu_data(), col_buffer_.gpu_shape(),
kernel_shape_.gpu_data(), pad_.gpu_data(), stride_.gpu_data(),
- data);
+ dilation_.gpu_data(), data);
}
}
#endif
@@ -44,6 +44,9 @@ class ConvolutionLayer : public BaseConvolutionLayer<Dtype> {
* convolution, given by pad for equal dimensions or pad_h and pad_w for
* different padding. Input padding is computed implicitly instead of
* actually padding.
+ * - dilation (\b optional, default 1). The filter
+ * dilation, given by a single dilation value for equal dimensions, or one
+ * value per spatial dimension for different dilations.
* - group (\b optional, default 1). The number of filter groups. Group
* convolution is a method for reducing parameterization by selectively
* connecting input and output channels. The input and output channel dimensions must be divisible
@@ -46,6 +46,8 @@ class Im2colLayer : public Layer<Dtype> {
Blob<int> stride_;
/// @brief The spatial dimensions of the padding.
Blob<int> pad_;
+ /// @brief The spatial dimensions of the dilation.
+ Blob<int> dilation_;
int num_spatial_axes_;
int bottom_dim_;
@@ -7,49 +7,53 @@ template <typename Dtype>
void im2col_nd_cpu(const Dtype* data_im, const int num_spatial_axes,
const int* im_shape, const int* col_shape,
const int* kernel_shape, const int* pad, const int* stride,
- Dtype* data_col);
+ const int* dilation, Dtype* data_col);
template <typename Dtype>
void im2col_cpu(const Dtype* data_im, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h,
- const int stride_w, Dtype* data_col);
+ const int stride_w, const int dilation_h, const int dilation_w,
+ Dtype* data_col);
template <typename Dtype>
void col2im_nd_cpu(const Dtype* data_col, const int num_spatial_axes,
const int* im_shape, const int* col_shape,
const int* kernel_shape, const int* pad, const int* stride,
- Dtype* data_im);
+ const int* dilation, Dtype* data_im);
template <typename Dtype>
void col2im_cpu(const Dtype* data_col, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h,
- const int stride_w, Dtype* data_im);
+ const int stride_w, const int dilation_h, const int dilation_w,
+ Dtype* data_im);
template <typename Dtype>
void im2col_nd_gpu(const Dtype* data_im, const int num_spatial_axes,
const int col_size, const int* im_shape, const int* col_shape,
const int* kernel_shape, const int* pad, const int* stride,
- Dtype* data_col);
+ const int* dilation, Dtype* data_col);
template <typename Dtype>
void im2col_gpu(const Dtype* data_im, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h,
- const int stride_w, Dtype* data_col);
+ const int stride_w, const int dilation_h, const int dilation_w,
+ Dtype* data_col);
template <typename Dtype>
void col2im_nd_gpu(const Dtype* data_col, const int num_spatial_axes,
const int im_size, const int* im_shape, const int* col_shape,
const int* kernel_shape, const int* pad, const int* stride,
- Dtype* data_im);
+ const int* dilation, Dtype* data_im);
template <typename Dtype>
void col2im_gpu(const Dtype* data_col, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const int stride_h,
- const int stride_w, Dtype* data_im);
+ const int stride_w, const int dilation_h, const int dilation_w,
+ Dtype* data_im);
} // namespace caffe
@@ -37,17 +37,32 @@ namespace caffe {
template <typename Dtype>
shared_ptr<Layer<Dtype> > GetConvolutionLayer(
const LayerParameter& param) {
- ConvolutionParameter_Engine engine = param.convolution_param().engine();
+ ConvolutionParameter conv_param = param.convolution_param();
+ ConvolutionParameter_Engine engine = conv_param.engine();
+#ifdef USE_CUDNN
+ bool use_dilation = false;
+ for (int i = 0; i < conv_param.dilation_size(); ++i) {
+ if (conv_param.dilation(i) > 1) {
+ use_dilation = true;
+ }
+ }
+#endif
if (engine == ConvolutionParameter_Engine_DEFAULT) {
engine = ConvolutionParameter_Engine_CAFFE;
#ifdef USE_CUDNN
- engine = ConvolutionParameter_Engine_CUDNN;
+ if (!use_dilation) {
+ engine = ConvolutionParameter_Engine_CUDNN;
+ }
#endif
}
if (engine == ConvolutionParameter_Engine_CAFFE) {
return shared_ptr<Layer<Dtype> >(new ConvolutionLayer<Dtype>(param));
#ifdef USE_CUDNN
} else if (engine == ConvolutionParameter_Engine_CUDNN) {
+ if (use_dilation) {
+ LOG(FATAL) << "CuDNN doesn't support the dilated convolution at Layer "
+ << param.name();
+ }
return shared_ptr<Layer<Dtype> >(new CuDNNConvolutionLayer<Dtype>(param));
#endif
} else {
@@ -36,7 +36,7 @@ void BaseConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
CHECK(num_kernel_dims == 1 || num_kernel_dims == num_spatial_axes_)
<< "kernel_size must be specified once, or once per spatial dimension "
<< "(kernel_size specified " << num_kernel_dims << " times; "
- << num_spatial_axes_ << " spatial dims);";
+ << num_spatial_axes_ << " spatial dims).";
for (int i = 0; i < num_spatial_axes_; ++i) {
kernel_shape_data[i] =
conv_param.kernel_size((num_kernel_dims == 1) ? 0 : i);
@@ -61,7 +61,7 @@ void BaseConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
num_stride_dims == num_spatial_axes_)
<< "stride must be specified once, or once per spatial dimension "
<< "(stride specified " << num_stride_dims << " times; "
- << num_spatial_axes_ << " spatial dims);";
+ << num_spatial_axes_ << " spatial dims).";
const int kDefaultStride = 1;
for (int i = 0; i < num_spatial_axes_; ++i) {
stride_data[i] = (num_stride_dims == 0) ? kDefaultStride :
@@ -85,13 +85,27 @@ void BaseConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
num_pad_dims == num_spatial_axes_)
<< "pad must be specified once, or once per spatial dimension "
<< "(pad specified " << num_pad_dims << " times; "
- << num_spatial_axes_ << " spatial dims);";
+ << num_spatial_axes_ << " spatial dims).";
const int kDefaultPad = 0;
for (int i = 0; i < num_spatial_axes_; ++i) {
pad_data[i] = (num_pad_dims == 0) ? kDefaultPad :
conv_param.pad((num_pad_dims == 1) ? 0 : i);
}
}
+ // Setup dilation dimensions (dilation_).
+ dilation_.Reshape(spatial_dim_blob_shape);
+ int* dilation_data = dilation_.mutable_cpu_data();
+ const int num_dilation_dims = conv_param.dilation_size();
+ CHECK(num_dilation_dims == 0 || num_dilation_dims == 1 ||
+ num_dilation_dims == num_spatial_axes_)
+ << "dilation must be specified once, or once per spatial dimension "
+ << "(dilation specified " << num_dilation_dims << " times; "
+ << num_spatial_axes_ << " spatial dims).";
+ const int kDefaultDilation = 1;
+ for (int i = 0; i < num_spatial_axes_; ++i) {
+ dilation_data[i] = (num_dilation_dims == 0) ? kDefaultDilation :
+ conv_param.dilation((num_dilation_dims == 1) ? 0 : i);
+ }
// Special case: im2col is the identity for 1x1 convolution with stride 1
// and no padding, so flag for skipping the buffer and transformation.
is_1x1_ = true;
@@ -9,11 +9,13 @@ void ConvolutionLayer<Dtype>::compute_output_shape() {
const int* kernel_shape_data = this->kernel_shape_.cpu_data();
const int* stride_data = this->stride_.cpu_data();
const int* pad_data = this->pad_.cpu_data();
+ const int* dilation_data = this->dilation_.cpu_data();
this->output_shape_.clear();
for (int i = 0; i < this->num_spatial_axes_; ++i) {
// i + 1 to skip channel axis
const int input_dim = this->input_shape(i + 1);
- const int output_dim = (input_dim + 2 * pad_data[i] - kernel_shape_data[i])
+ const int kernel_extent = dilation_data[i] * (kernel_shape_data[i] - 1) + 1;
+ const int output_dim = (input_dim + 2 * pad_data[i] - kernel_extent)
/ stride_data[i] + 1;
this->output_shape_.push_back(output_dim);
}
@@ -9,12 +9,14 @@ void DeconvolutionLayer<Dtype>::compute_output_shape() {
const int* kernel_shape_data = this->kernel_shape_.cpu_data();
const int* stride_data = this->stride_.cpu_data();
const int* pad_data = this->pad_.cpu_data();
+ const int* dilation_data = this->dilation_.cpu_data();
this->output_shape_.clear();
for (int i = 0; i < this->num_spatial_axes_; ++i) {
// i + 1 to skip channel axis
const int input_dim = this->input_shape(i + 1);
+ const int kernel_extent = dilation_data[i] * (kernel_shape_data[i] - 1) + 1;
const int output_dim = stride_data[i] * (input_dim - 1)
- + kernel_shape_data[i] - 2 * pad_data[i];
+ + kernel_extent - 2 * pad_data[i];
this->output_shape_.push_back(output_dim);
}
}
@@ -87,6 +87,20 @@ void Im2colLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
conv_param.pad((num_pad_dims == 1) ? 0 : i);
}
}
+ // Setup dilation dimensions (dilation_).
+ dilation_.Reshape(dim_blob_shape);
+ int* dilation_data = dilation_.mutable_cpu_data();
+ const int num_dilation_dims = conv_param.dilation_size();
+ CHECK(num_dilation_dims == 0 || num_dilation_dims == 1 ||
+ num_dilation_dims == num_spatial_axes_)
+ << "dilation must be specified once, or once per spatial dimension "
+ << "(dilation specified " << num_dilation_dims << " times; "
+ << num_spatial_axes_ << " spatial dims).";
+ const int kDefaultDilation = 1;
+ for (int i = 0; i < num_spatial_axes_; ++i) {
+ dilation_data[i] = (num_dilation_dims == 0) ? kDefaultDilation :
+ conv_param.dilation((num_dilation_dims == 1) ? 0 : i);
+ }
}
template <typename Dtype>
@@ -96,10 +110,12 @@ void Im2colLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
const int* kernel_shape_data = kernel_shape_.cpu_data();
const int* stride_data = stride_.cpu_data();
const int* pad_data = pad_.cpu_data();
+ const int* dilation_data = dilation_.cpu_data();
for (int i = 0; i < num_spatial_axes_; ++i) {
top_shape[channel_axis_] *= kernel_shape_data[i];
const int input_dim = bottom[0]->shape(channel_axis_ + i + 1);
- const int output_dim = (input_dim + 2 * pad_data[i] - kernel_shape_data[i])
+ const int kernel_extent = dilation_data[i] * (kernel_shape_data[i] - 1) + 1;
+ const int output_dim = (input_dim + 2 * pad_data[i] - kernel_extent)
/ stride_data[i] + 1;
top_shape[channel_axis_ + i + 1] = output_dim;
}
@@ -122,20 +138,22 @@ void Im2colLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
DCHECK_EQ(kernel_shape_.count(), num_spatial_axes_);
DCHECK_EQ(pad_.count(), num_spatial_axes_);
DCHECK_EQ(stride_.count(), num_spatial_axes_);
+ DCHECK_EQ(dilation_.count(), num_spatial_axes_);
if (!force_nd_im2col_ && num_spatial_axes_ == 2) {
im2col_cpu(bottom_data + n * bottom_dim_, channels_,
bottom[0]->shape(channel_axis_ + 1),
bottom[0]->shape(channel_axis_ + 2),
kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
pad_.cpu_data()[0], pad_.cpu_data()[1],
stride_.cpu_data()[0], stride_.cpu_data()[1],
+ dilation_.cpu_data()[0], dilation_.cpu_data()[1],
top_data + n * top_dim_);
} else {
im2col_nd_cpu(bottom_data + n * bottom_dim_, num_spatial_axes_,
bottom[0]->shape().data() + channel_axis_,
top[0]->shape().data() + channel_axis_,
kernel_shape_.cpu_data(), pad_.cpu_data(), stride_.cpu_data(),
- top_data + n * top_dim_);
+ dilation_.cpu_data(), top_data + n * top_dim_);
}
}
}
@@ -153,13 +171,14 @@ void Im2colLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
pad_.cpu_data()[0], pad_.cpu_data()[1],
stride_.cpu_data()[0], stride_.cpu_data()[1],
+ dilation_.cpu_data()[0], dilation_.cpu_data()[1],
bottom_diff + n * bottom_dim_);
} else {
col2im_nd_cpu(top_diff + n * top_dim_, num_spatial_axes_,
bottom[0]->shape().data() + channel_axis_,
top[0]->shape().data() + channel_axis_,
kernel_shape_.cpu_data(), pad_.cpu_data(), stride_.cpu_data(),
- bottom_diff + n * bottom_dim_);
+ dilation_.cpu_data(), bottom_diff + n * bottom_dim_);
}
}
}
@@ -19,13 +19,14 @@ void Im2colLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
pad_.cpu_data()[0], pad_.cpu_data()[1],
stride_.cpu_data()[0], stride_.cpu_data()[1],
+ dilation_.cpu_data()[0], dilation_.cpu_data()[1],
top_data + n * top_dim_);
} else {
im2col_nd_gpu(bottom_data + n * bottom_dim_, num_spatial_axes_,
num_kernels, bottom[0]->gpu_shape() + channel_axis_,
top[0]->gpu_shape() + channel_axis_,
kernel_shape_.gpu_data(), pad_.gpu_data(), stride_.gpu_data(),
- top_data + n * top_dim_);
+ dilation_.gpu_data(), top_data + n * top_dim_);
}
}
}
@@ -43,13 +44,14 @@ void Im2colLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
pad_.cpu_data()[0], pad_.cpu_data()[1],
stride_.cpu_data()[0], stride_.cpu_data()[1],
+ dilation_.cpu_data()[0], dilation_.cpu_data()[1],
bottom_diff + n * bottom_dim_);
} else {
col2im_nd_gpu(top_diff + n * top_dim_, num_spatial_axes_, bottom_dim_,
bottom[0]->gpu_shape() + channel_axis_,
top[0]->gpu_shape() + channel_axis_,
kernel_shape_.gpu_data(), pad_.gpu_data(), stride_.gpu_data(),
- bottom_diff + n * bottom_dim_);
+ dilation_.gpu_data(), bottom_diff + n * bottom_dim_);
}
}
}
@@ -518,6 +518,10 @@ message ConvolutionParameter {
repeated uint32 pad = 3; // The padding size; defaults to 0
repeated uint32 kernel_size = 4; // The kernel size
repeated uint32 stride = 6; // The stride; defaults to 1
+ // Factor used to dilate the kernel, (implicitly) zero-filling the resulting
+ // holes. (Kernel dilation is sometimes referred to by its use in the
+ // algorithme à trous from Holschneider et al. 1987.)
+ repeated uint32 dilation = 18; // The dilation; defaults to 1
// For 2D convolution only, the *_h and *_w versions may also be used to
// specify both spatial dimensions.
(End of captured diff; the remainder of the pull-request page failed to load.)