// convolution.hpp
// Copyright 2016, Tobias Hermann.
// https://github.com/Dobiasd/frugally-deep
// Distributed under the MIT License.
// (See accompanying LICENSE file or at
// https://opensource.org/licenses/MIT)
#pragma once
#include "fdeep/common.hpp"
#include "fdeep/filter.hpp"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>
namespace fdeep { namespace internal
{
// Bundles a set of convolution filters converted to the internal im2col
// representation used by the GEMM-based convolution routines below.
struct convolution_filter_matrices
{
    tensor_shape filter_shape_; // shape (height, width, depth) shared by all filters
    std::size_t filter_count_;  // number of filters (= output depth of the convolution)
    float_vec biases_;          // one bias value per filter
    bool use_bias_;             // false when every bias is zero, so the bias step can be skipped
    tensor filter_mats_;        // filter weights laid out as (height, width, depth, filter_count)
};
// Converts a stack of filters (which must all share one shape) into the
// im2col representation consumed by the GEMM-based convolution routines:
// the weights are copied into a single (height, width, depth, count) tensor.
inline convolution_filter_matrices generate_im2col_filter_matrix(
    const std::vector<filter>& filters)
{
    assertion(fplus::all_the_same_on(
        fplus_c_mem_fn_t(filter, shape, tensor_shape), filters),
        "all filters must have the same shape");

    const auto filter_shape = filters.front().shape();
    const auto bias_values = fplus::transform_convert<float_vec>(
        fplus_c_mem_fn_t(filter, get_bias, float_type),
        filters);

    // Bias application can be skipped entirely when every bias is zero.
    const bool any_bias =
        !fplus::all_the_same(bias_values) ||
        fplus::sum(bias_values) != static_cast<float_type>(0);

    tensor weights(
        tensor_shape(filter_shape.height_, filter_shape.width_,
            filter_shape.depth_, filters.size()),
        static_cast<float_type>(0));
    for (std::size_t row = 0; row < filter_shape.height_; ++row)
    {
        for (std::size_t f = 0; f < filters.size(); ++f)
        {
            for (std::size_t col = 0; col < filter_shape.width_; ++col)
            {
                for (std::size_t chan = 0; chan < filter_shape.depth_; ++chan)
                {
                    weights.set(tensor_pos(row, col, chan, f),
                        filters[f].get(tensor_pos(row, col, chan)));
                }
            }
        }
    }
    return {filter_shape, filters.size(), bias_values, any_bias, weights};
}
// Convenience overload: wraps a single filter into a one-element vector
// and delegates to the multi-filter version.
inline convolution_filter_matrices generate_im2col_single_filter_matrix(
    const filter& filter)
{
    const filter_vec single(1, filter);
    return generate_im2col_filter_matrix(single);
}
// Allocates the zero-initialized output tensor for a convolution and, when
// biases are in use, pre-fills each spatial position with the per-channel
// bias values, so the subsequent accumulating GEMMs only add on top.
inline tensor init_conv_output_tensor(
    std::size_t out_height,
    std::size_t out_width,
    std::size_t out_depth,
    std::size_t rank,
    const convolution_filter_matrices& filter_mat)
{
    tensor output(tensor_shape_with_changed_rank(
        tensor_shape(out_height, out_width, out_depth),
        rank),
        static_cast<float_type>(0));
    if (filter_mat.use_bias_) {
        // Callers in this file assert out_depth == filter_mat.biases_.size(),
        // so reading out_depth values starting at front() stays in bounds.
        const auto bias_ptr = &filter_mat.biases_.front();
        const auto bias_ptr_end = bias_ptr + out_depth;
        for (std::size_t y_out = 0; y_out < out_height; ++y_out)
        {
            for (std::size_t x_out = 0; x_out < out_width; ++x_out)
            {
                // NOTE(review): assumes depth is the innermost, contiguous
                // dimension of the tensor's storage, so the biases can be
                // copied as one contiguous run per (y, x) position — confirm
                // against the tensor implementation.
                auto output_ptr = &output.get_ref_ignore_rank(tensor_pos(0, 0, y_out, x_out, 0));
                std::copy(bias_ptr, bias_ptr_end, output_ptr);
            }
        }
    }
    return output;
}
// Maps one horizontal slice of the input tensor as an im2col-style matrix:
// each column covers one receptive field row of width f_width * f_depth,
// and there is one column per output x position.
inline Eigen::Map<ColMajorMatrixXf, Eigen::Unaligned, Eigen::OuterStride<>> get_im2col_mapping(
    const tensor& in,
    std::size_t f_width,
    std::size_t f_depth,
    std::size_t strides_x,
    std::size_t out_width,
    std::size_t y,
    std::size_t y_filt)
{
    // To avoid using too much RAM, the input tensor is not materialized
    // as an actual im2col matrix. Instead, an outer stride smaller than
    // the column height makes consecutive columns overlap in memory,
    // which realizes the overlapping receptive fields in place.
    return Eigen::Map<ColMajorMatrixXf, Eigen::Unaligned, Eigen::OuterStride<>>(
        const_cast<float_type*>(&in.get_ref_ignore_rank(tensor_pos(0, 0, y + y_filt, 0, 0))),
        static_cast<EigenIndex>(f_width * f_depth),
        static_cast<EigenIndex>(out_width),
        Eigen::OuterStride<>(static_cast<EigenIndex>(f_depth * strides_x)));
}
// Special version for convolution with strides_x == 1 and strides_y == 1.
// Reduces the forward-pass runtime of VGG19 by about 15%, by using fewer but larger GEMMs.
inline tensor convolve_accumulative_s1x1(
    std::size_t out_height,
    std::size_t out_width,
    const convolution_filter_matrices& filter_mat,
    const tensor& in)
{
    const tensor& filter_mats = filter_mat.filter_mats_;
    const auto f_height = filter_mat.filter_shape_.height_;
    const auto f_width = filter_mat.filter_shape_.width_;
    const auto f_depth = filter_mat.filter_shape_.depth_;
    const auto out_depth = filter_mat.filter_count_;
    assertion(f_depth == in.shape().depth_, "filter depth does not match input");
    assertion(filter_mats.shape().size_dim_4_ == f_height, "incorrect number of filter levels in y direction");
    assertion(out_width == (in.shape().width_ - f_width) + 1, "output width does not match");
    // Fixed typo in assertion message ("invlid" -> "invalid").
    assertion(out_depth == filter_mat.biases_.size(), "invalid bias count");

    // Output starts out pre-filled with the biases (if any).
    tensor output = init_conv_output_tensor(out_height, out_width, out_depth, in.shape().rank(), filter_mat);

    // With unit strides, all rows can be processed by one GEMM per filter
    // level: the im2col mapping is stretched across row boundaries, which
    // produces f_width - 1 superfluous columns "between" consecutive rows.
    // Those extra results are accumulated into output_temp and dropped below.
    const std::size_t out_width_temp = out_width + f_width - 1;
    tensor output_temp(tensor_shape_with_changed_rank(
        tensor_shape(out_height, out_width_temp, out_depth),
        in.shape().rank()),
        static_cast<float_type>(0));
    // Columns spanned by the single mapping: full temp rows for all but the
    // last output row, plus out_width columns for the last one.
    const auto mapping_width = out_width_temp * (out_height - 1) + out_width;
    for (std::size_t y_filt = 0; y_filt < f_height; ++y_filt)
    {
        // One (out_depth x f_width*f_depth) weight matrix per filter level.
        const Eigen::Map<ColMajorMatrixXf, Eigen::Unaligned>
            filter(const_cast<float_type*>(&filter_mats.get_ref_ignore_rank(tensor_pos(0, y_filt, 0, 0, 0))),
                static_cast<EigenIndex>(out_depth),
                static_cast<EigenIndex>(f_width * f_depth));
        const auto input = get_im2col_mapping(in, f_width, f_depth, 1, mapping_width, 0, y_filt);
        Eigen::Map<Eigen::Matrix<float_type, Eigen::Dynamic, Eigen::Dynamic>, Eigen::Unaligned>
            output_temp_map(&output_temp.get_ref_ignore_rank(tensor_pos(0, 0, 0, 0, 0)),
                static_cast<EigenIndex>(out_depth),
                static_cast<EigenIndex>(mapping_width));
        output_temp_map.noalias() += filter * input;
    }
    // Dropping the superfluous results from "between" the rows.
    for (std::size_t y_out = 0; y_out < out_height; ++y_out)
    {
        for (std::size_t x_out = 0; x_out < out_width; ++x_out)
        {
            for (std::size_t z_out = 0; z_out < out_depth; ++z_out)
            {
                output.get_ref_ignore_rank(tensor_pos(0, 0, y_out, x_out, z_out)) +=
                    output_temp.get_ref_ignore_rank(tensor_pos(0, 0, y_out, x_out, z_out));
            }
        }
    }
    return output;
}
// General strided convolution via im2col + GEMM, accumulating one GEMM per
// vertical filter level into the (bias-prefilled) output tensor.
inline tensor convolve_accumulative(
    std::size_t out_height,
    std::size_t out_width,
    std::size_t strides_y,
    std::size_t strides_x,
    const convolution_filter_matrices& filter_mat,
    const tensor& in)
{
    // Using the im2col method, the convolution is expressed as GEMMs for performance.
    // https://stackoverflow.com/questions/16798888/2-d-convolution-as-a-matrix-matrix-multiplication
    // https://github.com/tensorflow/tensorflow/blob/a0d784bdd31b27e013a7eac58a86ba62e86db299/tensorflow/core/kernels/conv_ops_using_gemm.cc
    // http://www.youtube.com/watch?v=pA4BsUK3oP4&t=36m22s
    const tensor& filter_mats = filter_mat.filter_mats_;
    const auto f_height = filter_mat.filter_shape_.height_;
    const auto f_width = filter_mat.filter_shape_.width_;
    const auto f_depth = filter_mat.filter_shape_.depth_;
    const auto out_depth = filter_mat.filter_count_;
    assertion(f_depth == in.shape().depth_, "filter depth does not match input");
    assertion(filter_mats.shape().size_dim_4_ == f_height, "incorrect number of filter levels in y direction");
    assertion(out_width == (in.shape().width_ - f_width) / strides_x + 1, "output width does not match");
    // Fixed typo in assertion message ("invlid" -> "invalid").
    assertion(out_depth == filter_mat.biases_.size(), "invalid bias count");
    if (strides_x == 1 && strides_y == 1)
    {
        // Unit strides allow fewer but larger GEMMs; see the special version.
        return convolve_accumulative_s1x1(out_height, out_width, filter_mat, in);
    }
    // Output starts out pre-filled with the biases (if any).
    tensor output = init_conv_output_tensor(out_height, out_width, out_depth, in.shape().rank(), filter_mat);
    for (std::size_t y_filt = 0; y_filt < f_height; ++y_filt)
    {
        // One (out_depth x f_width*f_depth) weight matrix per filter level.
        const Eigen::Map<ColMajorMatrixXf, Eigen::Unaligned>
            filter(const_cast<float_type*>(&filter_mats.get_ref_ignore_rank(tensor_pos(0, y_filt, 0, 0, 0))),
                static_cast<EigenIndex>(out_depth),
                static_cast<EigenIndex>(f_width * f_depth));
        // One GEMM per output row, accumulating across the filter levels.
        for (std::size_t y = 0, y_out = 0; y < in.shape().height_ + 1 - f_height; y += strides_y, ++y_out)
        {
            const auto input = get_im2col_mapping(in, f_width, f_depth, strides_x, out_width, y, y_filt);
            Eigen::Map<ColMajorMatrixXf, Eigen::Unaligned>
                output_map(&output.get_ref_ignore_rank(tensor_pos(0, 0, y_out, 0, 0)),
                    static_cast<EigenIndex>(out_depth),
                    static_cast<EigenIndex>(out_width));
            output_map.noalias() += filter * input;
        }
    }
    return output;
}
// Padding modes following the TensorFlow/Keras naming:
// valid = no padding, same = pad so output size is ceil(input / stride),
// causal = pad only at the top/left (used for temporal convolutions).
enum class padding { valid, same, causal };
// Result of resolving a padding mode: how much to pad on each side of the
// input, and the resulting output dimensions.
struct convolution_config
{
    std::size_t pad_top_;
    std::size_t pad_bottom_;
    std::size_t pad_left_;
    std::size_t pad_right_;
    std::size_t out_height_;
    std::size_t out_width_;
};
// Computes output size and per-side padding amounts for a convolution,
// matching TensorFlow's semantics for "valid", "same", and "causal" padding.
// Throws (via integral_cast_throw) if any computed value would be negative.
inline convolution_config preprocess_convolution(
    const shape2& filter_shape,
    const shape2& strides,
    padding pad_type,
    std::size_t input_shape_height,
    std::size_t input_shape_width)
{
    // https://www.tensorflow.org/api_guides/python/nn#Convolution
    const int filter_height = static_cast<int>(filter_shape.height_);
    const int filter_width = static_cast<int>(filter_shape.width_);
    const int in_height = static_cast<int>(input_shape_height);
    const int in_width = static_cast<int>(input_shape_width);
    const int strides_y = static_cast<int>(strides.height_);
    const int strides_x = static_cast<int>(strides.width_);
    int out_height = 0;
    int out_width = 0;
    // The "- 0.001" guards against float representations of exact integer
    // quotients landing a hair above the integer and ceiling up one too far.
    // NOTE(review): this epsilon trick is load-bearing — do not "simplify"
    // these expressions without verifying boundary cases.
    if (pad_type == padding::same || pad_type == padding::causal)
    {
        out_height = fplus::ceil(static_cast<float>(in_height) / static_cast<float>(strides_y) - 0.001);
        out_width = fplus::ceil(static_cast<float>(in_width) / static_cast<float>(strides_x) - 0.001);
    }
    else
    {
        out_height = fplus::ceil(static_cast<float>(in_height - filter_height + 1) / static_cast<float>(strides_y) - 0.001);
        out_width = fplus::ceil(static_cast<float>(in_width - filter_width + 1) / static_cast<float>(strides_x) - 0.001);
    }
    int pad_top = 0;
    int pad_bottom = 0;
    int pad_left = 0;
    int pad_right = 0;
    if (pad_type == padding::same)
    {
        // TensorFlow's SAME rule: total padding makes the strided windows
        // cover the input exactly; any odd remainder goes to bottom/right.
        int pad_along_height = 0;
        int pad_along_width = 0;
        if (in_height % strides_y == 0)
            pad_along_height = std::max(filter_height - strides_y, 0);
        else
            pad_along_height = std::max(filter_height - (in_height % strides_y), 0);
        if (in_width % strides_x == 0)
            pad_along_width = std::max(filter_width - strides_x, 0);
        else
            pad_along_width = std::max(filter_width - (in_width % strides_x), 0);
        pad_top = pad_along_height / 2;
        pad_bottom = pad_along_height - pad_top;
        pad_left = pad_along_width / 2;
        pad_right = pad_along_width - pad_left;
    }
    else if (pad_type == padding::causal)
    {
        // Causal padding: shift everything so no output position can see
        // "future" input — pad only top/left by filter size minus one.
        pad_top = filter_height - 1;
        pad_left = filter_width - 1;
    }
    // integral_cast_throw rejects negative values (e.g. a "valid" output
    // size smaller than zero when the filter exceeds the input).
    std::size_t out_height_size_t = fplus::integral_cast_throw<std::size_t>(out_height);
    std::size_t out_width_size_t = fplus::integral_cast_throw<std::size_t>(out_width);
    std::size_t pad_top_size_t = fplus::integral_cast_throw<std::size_t>(pad_top);
    std::size_t pad_bottom_size_t = fplus::integral_cast_throw<std::size_t>(pad_bottom);
    std::size_t pad_left_size_t = fplus::integral_cast_throw<std::size_t>(pad_left);
    std::size_t pad_right_size_t = fplus::integral_cast_throw<std::size_t>(pad_right);
    return {pad_top_size_t, pad_bottom_size_t,
        pad_left_size_t, pad_right_size_t,
        out_height_size_t, out_width_size_t};
}
// Convolves the input with the given filters: resolves padding for the
// requested mode, pads the input, then runs the accumulative GEMM routine.
inline tensor convolve(
    const shape2& strides,
    const padding& pad_type,
    const convolution_filter_matrices& filter_mat,
    const tensor& input)
{
    assertion(filter_mat.filter_shape_.depth_ == input.shape().depth_,
        "invalid filter depth");

    const convolution_config cfg = preprocess_convolution(
        filter_mat.filter_shape_.without_depth(),
        strides, pad_type, input.shape().height_, input.shape().width_);

    // The padding step usually (on a VGG19 net) only takes about 1% of the overall runtime.
    // So the increased code complexity of doing it inside the convolution step
    // is probably not worth the small potential performance gain.
    const tensor padded_input = pad_tensor(0,
        cfg.pad_top_, cfg.pad_bottom_, cfg.pad_left_, cfg.pad_right_,
        input);

    return convolve_accumulative(
        cfg.out_height_, cfg.out_width_,
        strides.height_, strides.width_,
        filter_mat,
        padded_input);
}
} } // namespace fdeep, namespace internal