Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fft based convolutional layer #544

Closed
wants to merge 9 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
26 changes: 26 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,11 @@ ifneq ($(CPU_ONLY), 1)
INCLUDE_DIRS += $(CUDA_INCLUDE_DIR)
LIBRARY_DIRS += $(CUDA_LIB_DIR)
LIBRARIES := cudart cublas curand
ifeq ($(FFT), 1)
LIBRARIES += cufft
endif
endif

LIBRARIES += glog gflags protobuf leveldb snappy \
lmdb boost_system hdf5_hl hdf5 m \
opencv_core opencv_highgui opencv_imgproc
Expand Down Expand Up @@ -303,6 +307,7 @@ ifeq ($(BLAS), mkl)
else ifeq ($(BLAS), open)
# OpenBLAS
LIBRARIES += openblas
BLAS_LIB ?= /opt/OpenBLAS/lib
else
# ATLAS
ifeq ($(LINUX), 1)
Expand All @@ -317,11 +322,32 @@ else
LDFLAGS += -framework vecLib
endif
endif

# FFT-based convolution support (opt-in; set FFT := 1 in Makefile.config).
FFT ?= 0
ifeq ($(FFT), 1)
# MKL bundles its own FFT routines, so FFTW is only linked for non-MKL BLAS.
# fftw3f = single precision, fftw3 = double precision.
ifneq ($(BLAS), mkl)
LIBRARIES += fftw3f fftw3
endif
# Expose the switch to the compiler so USE_FFT code paths are compiled in.
COMMON_FLAGS += -DUSE_FFT
endif

INCLUDE_DIRS += $(BLAS_INCLUDE)
LIBRARY_DIRS += $(BLAS_LIB)

LIBRARY_DIRS += $(LIB_BUILD_DIR)

# OpenMP (opt-in; set OPENMP := 1 in Makefile.config).
OPENMP ?= 0
ifeq ($(OPENMP), 1)
CXXFLAGS += -fopenmp
ifeq ($(BLAS), mkl)
# MKL must be paired with Intel's OpenMP runtime (iomp5), not GNU's.
LIBRARIES += iomp5
LIBRARY_DIRS += $(INTEL_OMP_DIR)/compiler/lib/intel64
else
# Link FFTW's OpenMP helper library for threaded FFTs.
# NOTE(review): this branch links only fftw3_omp — presumably the GNU
# OpenMP runtime (gomp) is pulled in implicitly by -fopenmp at link
# time; confirm, since -fopenmp is only added to CXXFLAGS here.
LIBRARIES += fftw3_omp
endif
endif

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not use gomp?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if you use MKL for BLAS you need to use MKL version of openMP: iomp5

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ifeq ($(BLAS), mkl) 
        LIBRARIES += iomp5
else ifeq ($(FFT), fftw)
        LIBRARIES += fftw3_omp 
else
        LIBRARIES += gomp 
endif

# Complete build flags.
COMMON_FLAGS += $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir))
CXXFLAGS += -pthread -fPIC $(COMMON_FLAGS) $(WARNINGS)
Expand Down
14 changes: 10 additions & 4 deletions Makefile.config.example
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# cuDNN acceleration switch (uncomment to build with cuDNN).
# USE_CUDNN := 1

# CPU-only switch (uncomment to build without GPU support).
# CPU-only switch (uncomment to build without GPU support).
# CPU_ONLY := 1

# To customize your choice of compiler, uncomment and set the following.
Expand All @@ -24,8 +24,8 @@ CUDA_ARCH := -gencode arch=compute_20,code=sm_20 \
-gencode arch=compute_20,code=sm_21 \
-gencode arch=compute_30,code=sm_30 \
-gencode arch=compute_35,code=sm_35 \
#-gencode arch=compute_50,code=sm_50 \
#-gencode arch=compute_50,code=compute_50
-gencode arch=compute_50,code=sm_50 \
-gencode arch=compute_50,code=compute_50

# BLAS choice:
# atlas for ATLAS (default)
Expand All @@ -36,7 +36,7 @@ BLAS := atlas
# Leave commented to accept the defaults for your choice of BLAS
# (which should work)!
# BLAS_INCLUDE := /path/to/your/blas
# BLAS_LIB := /path/to/your/blas
# BLAS_LIB := /opt/OpenBLAS/lib

# This is required only if you will compile the matlab interface.
# MATLAB directory should contain the mex binary in /bin.
Expand Down Expand Up @@ -72,5 +72,11 @@ DISTRIBUTE_DIR := distribute
# Uncomment for debugging. Does not work on OSX due to https://github.com/BVLC/caffe/issues/171
# DEBUG := 1

# Uncomment for FFT
# FFT := 1

# Uncomment for OpenMP.
# OPENMP := 1

# The ID of the GPU that 'make runtest' will use to run unit tests.
TEST_GPUID := 0
2 changes: 1 addition & 1 deletion examples/mnist/lenet_solver.prototxt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# The train/test net protocol buffer definition
net: "examples/mnist/lenet_train_test.prototxt"
net: "examples/mnist/lenet_train_test_fft.prototxt"
# test_iter specifies how many forward passes the test should carry out.
# In the case of MNIST, we have test batch size 100 and 100 test iterations,
# covering the full 10,000 testing images.
Expand Down
149 changes: 149 additions & 0 deletions examples/mnist/lenet_train_test_fft.prototxt
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
# LeNet on MNIST, with both convolution layers set to the FFT engine
# (engine: FFT) introduced by this patch. Otherwise identical in
# structure to the standard lenet_train_test network:
# data -> conv1 -> pool1 -> conv2 -> pool2 -> ip1 -> relu1 -> ip2 -> loss.
name: "LeNet"
# Training data source: MNIST train LMDB, batch size 64.
layers {
name: "mnist"
type: DATA
top: "data"
top: "label"
data_param {
source: "examples/mnist/mnist_train_lmdb"
backend: LMDB
batch_size: 64
}
transform_param {
# 1/256: scale raw 8-bit pixel values into [0, 1).
scale: 0.00390625
}
include: { phase: TRAIN }
}
# Test data source: MNIST test LMDB, batch size 100.
layers {
name: "mnist"
type: DATA
top: "data"
top: "label"
data_param {
source: "examples/mnist/mnist_test_lmdb"
backend: LMDB
batch_size: 100
}
transform_param {
# 1/256: scale raw 8-bit pixel values into [0, 1).
scale: 0.00390625
}
include: { phase: TEST }
}

# First convolution: 20 feature maps, 5x5 kernel, stride 1,
# computed via the FFT engine.
layers {
name: "conv1"
type: CONVOLUTION
bottom: "data"
top: "conv1"
# blobs_lr: learning-rate multipliers for weights (1x) and bias (2x).
blobs_lr: 1
blobs_lr: 2
convolution_param {
num_output: 20
kernel_size: 5
stride: 1
engine: FFT
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
# 2x2 max pooling, stride 2 (halves spatial resolution).
layers {
name: "pool1"
type: POOLING
bottom: "conv1"
top: "pool1"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
# Second convolution: 50 feature maps, 5x5 kernel, stride 1, FFT engine.
layers {
name: "conv2"
type: CONVOLUTION
bottom: "pool1"
top: "conv2"
blobs_lr: 1
blobs_lr: 2
convolution_param {
num_output: 50
kernel_size: 5
stride: 1
engine: FFT
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
# 2x2 max pooling, stride 2.
layers {
name: "pool2"
type: POOLING
bottom: "conv2"
top: "pool2"
pooling_param {
pool: MAX
kernel_size: 2
stride: 2
}
}
# Fully connected layer with 500 outputs.
layers {
name: "ip1"
type: INNER_PRODUCT
bottom: "pool2"
top: "ip1"
blobs_lr: 1
blobs_lr: 2
inner_product_param {
num_output: 500
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
# In-place ReLU non-linearity on ip1.
layers {
name: "relu1"
type: RELU
bottom: "ip1"
top: "ip1"
}
# Final fully connected layer: 10 outputs, one per MNIST digit class.
layers {
name: "ip2"
type: INNER_PRODUCT
bottom: "ip1"
top: "ip2"
blobs_lr: 1
blobs_lr: 2
inner_product_param {
num_output: 10
weight_filler {
type: "xavier"
}
bias_filler {
type: "constant"
}
}
}
# Classification accuracy, reported during the TEST phase only.
layers {
name: "accuracy"
type: ACCURACY
bottom: "ip2"
bottom: "label"
top: "accuracy"
include: { phase: TEST }
}
# Softmax + multinomial logistic loss used for training.
layers {
name: "loss"
type: SOFTMAX_LOSS
bottom: "ip2"
bottom: "label"
top: "loss"
}
18 changes: 18 additions & 0 deletions include/caffe/util/device_alternate.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#ifdef CPU_ONLY // CPU-only Caffe.


#include <vector>

// Stub out GPU calls as unavailable.
Expand Down Expand Up @@ -35,6 +36,11 @@ void classname<Dtype>::funcname##_##gpu(const vector<Blob<Dtype>*>& top, \
#include <cuda.h>
#include <cuda_runtime.h>
#include <curand.h>

#ifdef USE_FFT
#include <cufft.h>
#endif

#include <driver_types.h> // cuda driver types
#ifdef USE_CUDNN // cuDNN acceleration library.
#include "caffe/util/cudnn.hpp"
Expand Down Expand Up @@ -66,6 +72,15 @@ void classname<Dtype>::funcname##_##gpu(const vector<Blob<Dtype>*>& top, \
<< caffe::curandGetErrorString(status); \
} while (0)

#ifdef USE_FFT
// cuFFT: library error reporting, mirroring the CUBLAS_CHECK and
// CURAND_CHECK macros above. Evaluates `condition` once, requires it
// to equal CUFFT_SUCCESS, and logs the human-readable error name via
// caffe::cufftGetErrorEnum on failure.
// (No comments inside the macro: a comment between backslash-continued
// lines would be spliced into the macro body.)
#define CUFFT_CHECK(condition) \
do { \
cufftResult_t status = condition; \
CHECK_EQ(status, CUFFT_SUCCESS) << " " \
<< caffe::cufftGetErrorEnum(status); \
} while (0)
#endif

// CUDA: grid stride looping
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
Expand All @@ -80,6 +95,9 @@ namespace caffe {
// CUDA: library error reporting.
const char* cublasGetErrorString(cublasStatus_t error);
const char* curandGetErrorString(curandStatus_t error);
#ifdef USE_FFT
const char* cufftGetErrorEnum(cufftResult_t error);
#endif

// CUDA: thread number configuration.
// Use 1024 threads per block, which requires cuda sm_2x or above,
Expand Down