Merge NVIDIA's NCCL multi-GPU, switch it to python #4563
Merged
Commits
Show all changes
5 commits
Select commit
Hold shift + click to select a range
2317fa1
Logging from python, e.g. for lower log level on multi-GPU workers
cypof 3ba2054
Switched multi-GPU to NCCL
cypof e21b420
Python Multi-GPU
cypof 0d27efc
Python layers should build on multiprocess & solver_cnt; enable with …
mglaeser 5f28eb1
Using default from proto for prefetch
cypof
Jump to file or symbol
Failed to load files and symbols.
@@ -0,0 +1,26 @@
+set(NCCL_INC_PATHS
+  /usr/include
+  /usr/local/include
+  $ENV{NCCL_DIR}/include
+  )
+
+set(NCCL_LIB_PATHS
+  /lib
+  /lib64
+  /usr/lib
+  /usr/lib64
+  /usr/local/lib
+  /usr/local/lib64
+  $ENV{NCCL_DIR}/lib
+  )
+
+find_path(NCCL_INCLUDE_DIR NAMES nccl.h PATHS ${NCCL_INC_PATHS})
+find_library(NCCL_LIBRARIES NAMES nccl PATHS ${NCCL_LIB_PATHS})
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIR NCCL_LIBRARIES)
+
+if (NCCL_FOUND)
+  message(STATUS "Found NCCL (include: ${NCCL_INCLUDE_DIR}, library: ${NCCL_LIBRARIES})")
+  mark_as_advanced(NCCL_INCLUDE_DIR NCCL_LIBRARIES)
+endif ()
@@ -1,82 +0,0 @@
-#ifndef CAFFE_DATA_READER_HPP_
-#define CAFFE_DATA_READER_HPP_
-
-#include <map>
-#include <string>
-#include <vector>
-
-#include "caffe/common.hpp"
-#include "caffe/internal_thread.hpp"
-#include "caffe/util/blocking_queue.hpp"
-#include "caffe/util/db.hpp"
-
-namespace caffe {
-
-/**
- * @brief Reads data from a source to queues available to data layers.
- * A single reading thread is created per source, even if multiple solvers
- * are running in parallel, e.g. for multi-GPU training. This makes sure
- * databases are read sequentially, and that each solver accesses a different
- * subset of the database. Data is distributed to solvers in a round-robin
- * way to keep parallel training deterministic.
- */
-class DataReader {
- public:
-  explicit DataReader(const LayerParameter& param);
-  ~DataReader();
-
-  inline BlockingQueue<Datum*>& free() const {
-    return queue_pair_->free_;
-  }
-  inline BlockingQueue<Datum*>& full() const {
-    return queue_pair_->full_;
-  }
-
- protected:
-  // Queue pairs are shared between a body and its readers
-  class QueuePair {
-   public:
-    explicit QueuePair(int size);
-    ~QueuePair();
-
-    BlockingQueue<Datum*> free_;
-    BlockingQueue<Datum*> full_;
-
-    DISABLE_COPY_AND_ASSIGN(QueuePair);
-  };
-
-  // A single body is created per source
-  class Body : public InternalThread {
-   public:
-    explicit Body(const LayerParameter& param);
-    virtual ~Body();
-
-   protected:
-    void InternalThreadEntry();
-    void read_one(db::Cursor* cursor, QueuePair* qp);
-
-    const LayerParameter param_;
-    BlockingQueue<shared_ptr<QueuePair> > new_queue_pairs_;
-
-    friend class DataReader;
-
-    DISABLE_COPY_AND_ASSIGN(Body);
-  };
-
-  // A source is uniquely identified by its layer name + path, in case
-  // the same database is read from two different locations in the net.
-  static inline string source_key(const LayerParameter& param) {
-    return param.name() + ":" + param.data_param().source();
-  }
-
-  const shared_ptr<QueuePair> queue_pair_;
-  shared_ptr<Body> body_;
-
-  static map<const string, boost::weak_ptr<DataReader::Body> > bodies_;
-
-DISABLE_COPY_AND_ASSIGN(DataReader);
-};
-
-}  // namespace caffe
-
-#endif  // CAFFE_DATA_READER_HPP_
Oops, something went wrong.