|
|
@@ -21,21 +21,57 @@ void CuDNNConvolutionLayer<Dtype>::Forward_gpu( |
|
|
|
|
|
// Forward through cuDNN in parallel over groups.
|
|
|
for (int g = 0; g < this->group_; g++) {
|
|
|
+ cudnnConvolutionFwdAlgo_t algo;
|
|
|
+
|
|
|
+ // pick the convolution algorithm
|
|
|
+ // TODO(shelhamer) this should be done during reshape
|
|
|
+ // TODO(shelhamer) the choice of automatic or manual algorithm picking
|
|
|
+ // should be exposed in proto
|
|
|
+ CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(handle_[g],
|
|
|
+ bottom_descs_[i],
|
|
|
+ filter_desc_,
|
|
|
+ conv_descs_[i],
|
|
|
+ top_descs_[i],
|
|
|
+ CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
|
|
|
+ 0, // memoryLimitInBytes,
|
|
|
+ &algo));
|
|
|
+
|
|
|
+ // get minimum size of the workspace needed for the desired algorithm
|
|
|
+ size_t workspaceSizeInBytes_temp = 0;
|
|
|
+
|
|
|
+ CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(handle_[g],
|
|
|
+ bottom_descs_[i],
|
|
|
+ filter_desc_,
|
|
|
+ conv_descs_[i],
|
|
|
+ top_descs_[i],
|
|
|
+ algo,
|
|
|
+                          &workspaceSizeInBytes_temp));
|
|
|
+
|
|
|
+ if (workspaceSizeInBytes_temp > workspaceSizeInBytes) {
|
|
|
+ workspaceSizeInBytes = workspaceSizeInBytes_temp;
|
|
|
+ // free the existing workspace and allocate a new (larger) one
|
|
|
+        CUDA_CHECK(cudaFree(this->workspace));
|
|
|
+        CUDA_CHECK(cudaMalloc(&(this->workspace), workspaceSizeInBytes));
|
|
|
+ }
|
|
|
+
|
|
|
// Filters.
|
|
|
CUDNN_CHECK(cudnnConvolutionForward(handle_[g],
|
|
|
- bottom_descs_[i], bottom_data + bottom_offset_ * g,
|
|
|
- filter_desc_, weight + weight_offset_ * g,
|
|
|
- conv_descs_[i],
|
|
|
- top_descs_[i], top_data + top_offset_ * g,
|
|
|
- CUDNN_RESULT_NO_ACCUMULATE));
|
|
|
+ cudnn::dataType<Dtype>::one,
|
|
|
+ bottom_descs_[i], bottom_data + bottom_offset_ * g,
|
|
|
+ filter_desc_, weight + weight_offset_ * g,
|
|
|
+ conv_descs_[i],
|
|
|
+ algo, workspace, workspaceSizeInBytes,
|
|
|
+ cudnn::dataType<Dtype>::zero,
|
|
|
+ top_descs_[i], top_data + top_offset_ * g));
|
|
|
|
|
|
// Bias.
|
|
|
if (this->bias_term_) {
|
|
|
const Dtype* bias_data = this->blobs_[1]->gpu_data();
|
|
|
- Dtype alpha = 1.;
|
|
|
- CUDNN_CHECK(cudnnAddTensor4d(handle_[g], CUDNN_ADD_SAME_C, &alpha,
|
|
|
- bias_desc_, bias_data + bias_offset_ * g,
|
|
|
- top_descs_[i], top_data + top_offset_ * g));
|
|
|
+ CUDNN_CHECK(cudnnAddTensor(handle_[g], CUDNN_ADD_SAME_C,
|
|
|
+ cudnn::dataType<Dtype>::one,
|
|
|
+ bias_desc_, bias_data + bias_offset_ * g,
|
|
|
+ cudnn::dataType<Dtype>::one,
|
|
|
+ top_descs_[i], top_data + top_offset_ * g));
|
|
|
}
|
|
|
}
|
|
|
|
|
|
@@ -68,20 +104,22 @@ void CuDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top, |
|
|
// Gradient w.r.t. bias.
|
|
|
if (this->bias_term_ && this->param_propagate_down_[1]) {
|
|
|
CUDNN_CHECK(cudnnConvolutionBackwardBias(handle_[0*this->group_ + g],
|
|
|
- top_descs_[i], top_diff + top_offset_ * g,
|
|
|
- bias_desc_, bias_diff + bias_offset_ * g,
|
|
|
- CUDNN_RESULT_ACCUMULATE));
|
|
|
+ cudnn::dataType<Dtype>::one,
|
|
|
+ top_descs_[i], top_diff + top_offset_ * g,
|
|
|
+ cudnn::dataType<Dtype>::one,
|
|
|
+ bias_desc_, bias_diff + bias_offset_ * g));
|
|
|
}
|
|
|
|
|
|
// Gradient w.r.t. weights.
|
|
|
if (this->param_propagate_down_[0]) {
|
|
|
const Dtype* bottom_data = bottom[i]->gpu_data();
|
|
|
CUDNN_CHECK(cudnnConvolutionBackwardFilter(handle_[1*this->group_ + g],
|
|
|
- bottom_descs_[i], bottom_data + bottom_offset_ * g,
|
|
|
- top_descs_[i], top_diff + top_offset_ * g,
|
|
|
- conv_descs_[i],
|
|
|
- filter_desc_, weight_diff + weight_offset_ * g,
|
|
|
- CUDNN_RESULT_ACCUMULATE));
|
|
|
+ cudnn::dataType<Dtype>::one,
|
|
|
+ bottom_descs_[i], bottom_data + bottom_offset_ * g,
|
|
|
+ top_descs_[i], top_diff + top_offset_ * g,
|
|
|
+ conv_descs_[i],
|
|
|
+ cudnn::dataType<Dtype>::one,
|
|
|
+ filter_desc_, weight_diff + weight_offset_ * g));
|
|
|
}
|
|
|
|
|
|
// Gradient w.r.t. bottom data.
|
|
|
@@ -91,11 +129,12 @@ void CuDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top, |
|
|
}
|
|
|
Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
|
|
|
CUDNN_CHECK(cudnnConvolutionBackwardData(handle_[2*this->group_ + g],
|
|
|
- filter_desc_, weight + weight_offset_ * g,
|
|
|
- top_descs_[i], top_diff + top_offset_ * g,
|
|
|
- conv_descs_[i],
|
|
|
- bottom_descs_[i], bottom_diff + bottom_offset_ * g,
|
|
|
- CUDNN_RESULT_NO_ACCUMULATE));
|
|
|
+ cudnn::dataType<Dtype>::one,
|
|
|
+ filter_desc_, weight + weight_offset_ * g,
|
|
|
+ top_descs_[i], top_diff + top_offset_ * g,
|
|
|
+ conv_descs_[i],
|
|
|
+ cudnn::dataType<Dtype>::zero,
|
|
|
+ bottom_descs_[i], bottom_diff + bottom_offset_ * g));
|
|
|
}
|
|
|
}
|
|
|
|
|
|
|