Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fft based convolutional layer #544

Closed
wants to merge 9 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
26 changes: 26 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,11 @@ ifneq ($(CPU_ONLY), 1)
INCLUDE_DIRS += $(CUDA_INCLUDE_DIR)
LIBRARY_DIRS += $(CUDA_LIB_DIR)
LIBRARIES := cudart cublas curand
ifeq ($(FFT), 1)
LIBRARIES += cufft
endif
endif

LIBRARIES += glog gflags protobuf leveldb snappy \
lmdb boost_system hdf5_hl hdf5 m \
opencv_core opencv_highgui opencv_imgproc
Expand Down Expand Up @@ -303,6 +307,7 @@ ifeq ($(BLAS), mkl)
else ifeq ($(BLAS), open)
# OpenBLAS
LIBRARIES += openblas
BLAS_LIB ?= /opt/OpenBLAS/lib
else
# ATLAS
ifeq ($(LINUX), 1)
Expand All @@ -317,11 +322,32 @@ else
LDFLAGS += -framework vecLib
endif
endif

# FFT-based convolution support (opt-in; set FFT := 1 in Makefile.config).
FFT ?= 0
ifeq ($(FFT), 1)
# MKL bundles its own FFT routines, so FFTW is only linked for non-MKL BLAS.
# fftw3f = single precision, fftw3 = double precision.
ifneq ($(BLAS), mkl)
LIBRARIES += fftw3f fftw3
endif
# Expose the switch to the compiler so USE_FFT code paths are compiled in.
COMMON_FLAGS += -DUSE_FFT
endif

INCLUDE_DIRS += $(BLAS_INCLUDE)
LIBRARY_DIRS += $(BLAS_LIB)

LIBRARY_DIRS += $(LIB_BUILD_DIR)

# OpenMP (opt-in; set OPENMP := 1 in Makefile.config).
OPENMP ?= 0
ifeq ($(OPENMP), 1)
CXXFLAGS += -fopenmp
ifeq ($(BLAS), mkl)
# MKL must be paired with Intel's OpenMP runtime (iomp5), not GNU's.
LIBRARIES += iomp5
LIBRARY_DIRS += $(INTEL_OMP_DIR)/compiler/lib/intel64
else
# Link FFTW's OpenMP helper library for threaded FFTs.
# NOTE(review): this branch links only fftw3_omp — presumably the GNU
# OpenMP runtime (gomp) is pulled in implicitly by -fopenmp at link
# time; confirm, since -fopenmp is only added to CXXFLAGS here.
LIBRARIES += fftw3_omp
endif
endif

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not use gomp?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if you use MKL for BLAS you need to use MKL version of openMP: iomp5

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ifeq ($(BLAS), mkl) 
        LIBRARIES += iomp5
else ifeq ($(FFT), fftw)
        LIBRARIES += fftw3_omp 
else
        LIBRARIES += gomp 
endif

# Complete build flags.
COMMON_FLAGS += $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir))
CXXFLAGS += -pthread -fPIC $(COMMON_FLAGS) $(WARNINGS)
Expand Down
14 changes: 10 additions & 4 deletions Makefile.config.example
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# cuDNN acceleration switch (uncomment to build with cuDNN).
# USE_CUDNN := 1

# CPU-only switch (uncomment to build without GPU support).
# CPU-only switch (uncomment to build without GPU support).
# CPU_ONLY := 1

# To customize your choice of compiler, uncomment and set the following.
Expand All @@ -24,8 +24,8 @@ CUDA_ARCH := -gencode arch=compute_20,code=sm_20 \
-gencode arch=compute_20,code=sm_21 \
-gencode arch=compute_30,code=sm_30 \
-gencode arch=compute_35,code=sm_35 \
#-gencode arch=compute_50,code=sm_50 \
#-gencode arch=compute_50,code=compute_50
-gencode arch=compute_50,code=sm_50 \
-gencode arch=compute_50,code=compute_50

# BLAS choice:
# atlas for ATLAS (default)
Expand All @@ -36,7 +36,7 @@ BLAS := atlas
# Leave commented to accept the defaults for your choice of BLAS
# (which should work)!
# BLAS_INCLUDE := /path/to/your/blas
# BLAS_LIB := /path/to/your/blas
# BLAS_LIB := /opt/OpenBLAS/lib

# This is required only if you will compile the matlab interface.
# MATLAB directory should contain the mex binary in /bin.
Expand Down Expand Up @@ -72,5 +72,11 @@ DISTRIBUTE_DIR := distribute
# Uncomment for debugging. Does not work on OSX due to https://github.com/BVLC/caffe/issues/171
# DEBUG := 1

# Uncomment for FFT
# FFT := 1

# Uncomment for OpenMP.
# OPENMP := 1

# The ID of the GPU that 'make runtest' will use to run unit tests.
TEST_GPUID := 0
2 changes: 1 addition & 1 deletion examples/mnist/lenet_solver.prototxt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# The train/test net protocol buffer definition
net: "examples/mnist/lenet_train_test.prototxt"
net: "examples/mnist/lenet_train_test_fft.prototxt"
# test_iter specifies how many forward passes the test should carry out.
# In the case of MNIST, we have test batch size 100 and 100 test iterations,
# covering the full 10,000 testing images.
Expand Down
149 changes: 149 additions & 0 deletions examples/mnist/lenet_train_test_fft.prototxt
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
# LeNet on MNIST, with both convolution layers set to the FFT engine
# (engine: FFT) introduced by this patch. Otherwise identical in
# structure to the standard lenet_train_test network:
# data -> conv1 -> pool1 -> conv2 -> pool2 -> ip1 -> relu1 -> ip2 -> loss.
name: "LeNet"
# Training data source: MNIST train LMDB, batch size 64.
layers {
name: "mnist"
type: DATA
top: "data"
top: "label"
data_param {
source: "examples/mnist/mnist_train_lmdb"
backend: LMDB
batch_size: 64
}
transform_param {
# 1/256: scale raw 8-bit pixel values into [0, 1).
scale: 0.00390625
}
include: { phase: TRAIN }
}
# Test data source: MNIST test LMDB, batch size 100.
layers {
name: "mnist"
type: DATA
top: "data"
top: "label"
data_param {
source: "examples/mnist/mnist_test_lmdb"
backend: LMDB
batch_size: 100
}
transform_param {
# 1/256: scale raw 8-bit pixel values into [0, 1).
scale: 0.00390625
}
include: { phase: TEST }
}

# First convolution: 20 feature maps, 5x5 kernel, stride 1,
# computed via the FFT engine.
layers {
name: "conv1"
type: CONVOLUTION
bottom: "data"
top: "conv1"
# blobs_lr: learning-rate multipliers for weights (1x) and bias (2x).
blobs_lr: 1
blobs_lr: 2
convolution_param {
num_output: 20
kernel_size: 5
stride: 1
engine: FFT
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
# 2x2 max pooling, stride 2 (halves spatial resolution).
layers {
name: "pool1"
type: POOLING
bottom: "conv1"
top: "pool1"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
# Second convolution: 50 feature maps, 5x5 kernel, stride 1, FFT engine.
layers {
name: "conv2"
type: CONVOLUTION
bottom: "pool1"
top: "conv2"
blobs_lr: 1
blobs_lr: 2
convolution_param {
num_output: 50
kernel_size: 5
stride: 1
engine: FFT
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
# 2x2 max pooling, stride 2.
layers {
name: "pool2"
type: POOLING
bottom: "conv2"
top: "pool2"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
# Fully connected layer with 500 outputs.
layers {
name: "ip1"
type: INNER_PRODUCT
bottom: "pool2"
top: "ip1"
blobs_lr: 1
blobs_lr: 2
inner_product_param {
num_output: 500
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
# In-place ReLU non-linearity on ip1.
layers {
name: "relu1"
type: RELU
bottom: "ip1"
top: "ip1"
}
# Final fully connected layer: 10 outputs, one per MNIST digit class.
layers {
name: "ip2"
type: INNER_PRODUCT
bottom: "ip1"
top: "ip2"
blobs_lr: 1
blobs_lr: 2
inner_product_param {
num_output: 10
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
# Classification accuracy, reported during the TEST phase only.
layers {
name: "accuracy"
type: ACCURACY
bottom: "ip2"
bottom: "label"
top: "accuracy"
include: { phase: TEST }
}
# Softmax + multinomial logistic loss used for training.
layers {
name: "loss"
type: SOFTMAX_LOSS
bottom: "ip2"
bottom: "label"
top: "loss"
}
18 changes: 18 additions & 0 deletions include/caffe/util/device_alternate.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#ifdef CPU_ONLY // CPU-only Caffe.


#include <vector>

// Stub out GPU calls as unavailable.
Expand Down Expand Up @@ -35,6 +36,11 @@ void classname<Dtype>::funcname##_##gpu(const vector<Blob<Dtype>*>& top, \
#include <cuda.h>
#include <cuda_runtime.h>
#include <curand.h>

#ifdef USE_FFT
#include <cufft.h>
#endif

#include <driver_types.h> // cuda driver types
#ifdef USE_CUDNN // cuDNN acceleration library.
#include "caffe/util/cudnn.hpp"
Expand Down Expand Up @@ -66,6 +72,15 @@ void classname<Dtype>::funcname##_##gpu(const vector<Blob<Dtype>*>& top, \
<< caffe::curandGetErrorString(status); \
} while (0)

#ifdef USE_FFT
// cuFFT: library error reporting, mirroring the CUBLAS_CHECK and
// CURAND_CHECK macros above. Evaluates `condition` once, requires it
// to equal CUFFT_SUCCESS, and logs the human-readable error name via
// caffe::cufftGetErrorEnum on failure.
// (No comments inside the macro: a comment between backslash-continued
// lines would be spliced into the macro body.)
#define CUFFT_CHECK(condition) \
do { \
cufftResult_t status = condition; \
CHECK_EQ(status, CUFFT_SUCCESS) << " " \
<< caffe::cufftGetErrorEnum(status); \
} while (0)
#endif

// CUDA: grid stride looping
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
Expand All @@ -80,6 +95,9 @@ namespace caffe {
// CUDA: library error reporting.
const char* cublasGetErrorString(cublasStatus_t error);
const char* curandGetErrorString(curandStatus_t error);
#ifdef USE_FFT
const char* cufftGetErrorEnum(cufftResult_t error);
#endif

// CUDA: thread number configuration.
// Use 1024 threads per block, which requires cuda sm_2x or above,
Expand Down