AshburnLee · AshburnLee · Oct 20, 2020 · Oct 13, 2020 · Oct 13, 2020 · Oct 13, 2020
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -48,5 +48,13 @@ repos:
         name: copyright_checker
         entry: python ./tools/codestyle/copyright.hook
         language: system
-        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
+        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py|sh)$
         exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$
+-   repo: local
+    hooks:
+    -   id: shellcheck
+        name: shellcheck
+        entry: shellcheck
+        language: system
+        files: .sh$
+        exclude: (paddle_build.sh|fast_install.sh|check_file_diff_approvals.sh)
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -240,6 +240,12 @@ if(WITH_AMD_GPU)
     include(hip)
 endif(WITH_AMD_GPU)
 
+if(WITH_DISTRIBUTE)
+    if(LINUX)
+        set(WITH_GLOO ON CACHE STRING "Enable GLOO when compiling WITH_DISTRIBUTE=ON." FORCE)
+    endif()
+endif()
+
 if(WITH_ARM)
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")

diff --git a/cmake/external/gloo.cmake b/cmake/external/gloo.cmake
@@ -14,55 +14,41 @@
 
 INCLUDE(ExternalProject)
 
-execute_process(COMMAND bash -c "gcc -dumpversion" OUTPUT_VARIABLE GCC_VERSION)
-
 SET(GLOO_PROJECT       "extern_gloo")
-IF((NOT DEFINED GLOO_VER) OR (NOT DEFINED GLOO_URL))
-  MESSAGE(STATUS "use pre defined download url")
-  SET(GLOO_VER "master" CACHE STRING "" FORCE)
-  SET(GLOO_NAME "gloo" CACHE STRING "" FORCE)
-
-  if(${GCC_VERSION} VERSION_EQUAL "8.2.0")
-    SET(GLOO_URL "https://fleet.bj.bcebos.com/gloo/gloo.tar.gz.gcc8" CACHE STRING "" FORCE)
-  else()
-    SET(GLOO_URL "https://fleet.bj.bcebos.com/gloo/gloo.tar.gz.gcc482" CACHE STRING "" FORCE)
-  endif()
-ENDIF()
-
-MESSAGE(STATUS "GLOO_NAME: ${GLOO_NAME}, GLOO_URL: ${GLOO_URL}")
-SET(GLOO_SOURCE_DIR    "${THIRD_PARTY_PATH}/gloo")
-SET(GLOO_DOWNLOAD_DIR  "${GLOO_SOURCE_DIR}/src/${GLOO_PROJECT}")
-SET(GLOO_DST_DIR       "gloo")
-SET(GLOO_INSTALL_ROOT  "${THIRD_PARTY_PATH}/install")
-SET(GLOO_INSTALL_DIR   ${GLOO_INSTALL_ROOT}/${GLOO_DST_DIR})
-SET(GLOO_ROOT          ${GLOO_INSTALL_DIR})
-SET(GLOO_INC_DIR       ${GLOO_ROOT}/include)
-SET(GLOO_LIB_DIR       ${GLOO_ROOT}/lib)
-SET(GLOO_LIB           ${GLOO_LIB_DIR}/libgloo.a)
-#SET(GLOO_IOMP_LIB      ${GLOO_LIB_DIR}/libiomp5.so) #todo what is this
-SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${GLOO_ROOT}/lib")
-
-INCLUDE_DIRECTORIES(${GLOO_INC_DIR})
-
-FILE(WRITE ${GLOO_DOWNLOAD_DIR}/CMakeLists.txt
-  "PROJECT(GLOO)\n"
-  "cmake_minimum_required(VERSION 3.0)\n"
-  "install(DIRECTORY ${GLOO_NAME}/include ${GLOO_NAME}/lib \n"
-  "        DESTINATION ${GLOO_DST_DIR})\n")
+SET(GLOO_PREFIX_DIR    ${THIRD_PARTY_PATH}/gloo)
+SET(GLOO_SOURCE_DIR    ${THIRD_PARTY_PATH}/gloo/src/extern_gloo/gloo)
+SET(GLOO_INSTALL_DIR   ${THIRD_PARTY_PATH}/install/gloo)
+SET(GLOO_INCLUDE_DIR   "${GLOO_INSTALL_DIR}/include" CACHE PATH "gloo include directory." FORCE)
+SET(GLOO_LIBRARY_DIR   "${GLOO_INSTALL_DIR}/lib" CACHE PATH "gloo library directory." FORCE)
+# As we add extra features for gloo, we use the non-official repo
+SET(GLOO_REPOSITORY    https://github.com/sandyhouse/gloo.git)
+SET(GLOO_TAG           v0.0.2)
+SET(GLOO_LIBRARIES     "${GLOO_INSTALL_DIR}/lib/libgloo.a" CACHE FILEPATH "gloo library." FORCE)
+
+INCLUDE_DIRECTORIES(${GLOO_INCLUDE_DIR})
+
+cache_third_party(extern_gloo
+    REPOSITORY    ${GLOO_REPOSITORY}
+    TAG           ${GLOO_TAG}
+    DIR           GLOO_SOURCE_DIR)
 
 ExternalProject_Add(
-    ${GLOO_PROJECT}
+    extern_gloo
     ${EXTERNAL_PROJECT_LOG_ARGS}
-    PREFIX                ${GLOO_SOURCE_DIR}
-    DOWNLOAD_DIR          ${GLOO_DOWNLOAD_DIR}
-    DOWNLOAD_COMMAND      wget --no-check-certificate ${GLOO_URL} -c -q -O ${GLOO_NAME}.tar.gz
-                          && tar zxvf ${GLOO_NAME}.tar.gz
-    DOWNLOAD_NO_PROGRESS  1
+    ${SHALLOW_CLONE}
+    "${GLOO_DOWNLOAD_CMD}"
+    PREFIX                "${GLOO_PREFIX_DIR}"
+    SOURCE_DIR            "${GLOO_SOURCE_DIR}"
     UPDATE_COMMAND        ""
-    CMAKE_ARGS            -DCMAKE_INSTALL_PREFIX=${GLOO_INSTALL_ROOT}
-    CMAKE_CACHE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${GLOO_INSTALL_ROOT}
+    CONFIGURE_COMMAND     ""
+    BUILD_COMMAND         mkdir -p ${GLOO_SOURCE_DIR}/build
+        && cd ${GLOO_SOURCE_DIR}/build && cmake .. && make
+        && mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo
+    INSTALL_COMMAND      ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR}
+    COMMAND              ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo"
 )
 
-ADD_LIBRARY(gloo SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET gloo PROPERTY IMPORTED_LOCATION ${GLOO_LIB})
+
+ADD_LIBRARY(gloo STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET gloo PROPERTY IMPORTED_LOCATION ${GLOO_LIBRARIES})
 ADD_DEPENDENCIES(gloo ${GLOO_PROJECT})
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
@@ -247,7 +247,8 @@ cc_library(parallel_executor SRCS parallel_executor.cc DEPS
         graph build_strategy collective_helper
         fast_threaded_ssa_graph_executor variable_helper)
 
-cc_test(dist_multi_trainer_test SRCS dist_multi_trainer_test.cc DEPS executor)
+cc_test(dist_multi_trainer_test SRCS dist_multi_trainer_test.cc DEPS
+    conditional_block_op executor)
 cc_library(prune SRCS prune.cc DEPS framework_proto boost)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
 cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry

diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <map>
 #include <memory>
 #include <mutex>  // NOLINT
+#include <set>
 #include <string>
 #include <thread>         // NOLINT
 #include <unordered_map>  // NOLINT
@@ -313,6 +314,10 @@ class DownpourWorker : public HogwildWorker {
   std::map<uint64_t, std::vector<std::string>> dense_value_names_;
   std::map<uint64_t, uint64_t> table_dependency_;
   std::vector<std::pair<uint64_t, uint64_t>> copy_dense_tables_;
+  // multitask
+  std::map<int32_t, uint64_t> cond2table_map_;
+  std::set<uint64_t> condvalue_set_;
+  bool flag_partial_push_;
 
  private:
   // std::vector<std::string> dump_param_;

diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <cstdlib>
+#include <ctime>
 #include "paddle/fluid/framework/device_worker.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 
@@ -65,6 +67,13 @@ void DownpourWorker::Initialize(const TrainerDesc& desc) {
     }
   }
 
+  flag_partial_push_ = false;
+  for (auto& m : param_.program_config(0).partial_pushdense_condtable_map()) {
+    cond2table_map_[m.key()] = m.value();
+    condvalue_set_.insert(m.value());
+    flag_partial_push_ = true;
+  }
+
   skip_ops_.resize(param_.skip_ops_size());
   for (int i = 0; i < param_.skip_ops_size(); ++i) {
     skip_ops_[i] = param_.skip_ops(i);
@@ -876,14 +885,42 @@ void DownpourWorker::TrainFiles() {
 #endif
 
     if (need_to_push_dense_) {
-      for (int i = 0; i < param_.program_config(0).push_dense_table_id_size();
-           ++i) {
-        uint64_t tid = static_cast<uint64_t>(
-            param_.program_config(0).push_dense_table_id(i));
-        fleet_ptr_->PushDenseVarsAsync(
-            *thread_scope_, tid, dense_grad_names_[tid], &push_sparse_status_,
-            scale_datanorm_, cur_batch);
+      if (flag_partial_push_) {
+        Variable* var = (*thread_scope_).FindVar("cond_tag");
+        LoDTensor* tensor = var->GetMutable<LoDTensor>();
+        // check type in python code
+        int64_t* cond_value_batch = tensor->data<int64_t>();
+
+        for (int i = 0; i < param_.program_config(0).push_dense_table_id_size();
+             ++i) {
+          uint64_t tid = static_cast<uint64_t>(
+              param_.program_config(0).push_dense_table_id(i));
+          if (condvalue_set_.find(tid) != condvalue_set_.end()) {
+            // common dense table must push dense
+            if (cond2table_map_[cond_value_batch[0]] != tid) {
+              // can't push dense
+              continue;
+            }
+          }
+
+          VLOG(3) << "push multitask dense gradient " << tid;
+          fleet_ptr_->PushDenseVarsAsync(
+              *thread_scope_, tid, dense_grad_names_[tid], &push_sparse_status_,
+              scale_datanorm_, cur_batch);
+        }
+
+      } else {
+        for (int i = 0; i < param_.program_config(0).push_dense_table_id_size();
+             ++i) {
+          uint64_t tid = static_cast<uint64_t>(
+              param_.program_config(0).push_dense_table_id(i));
+
+          fleet_ptr_->PushDenseVarsAsync(
+              *thread_scope_, tid, dense_grad_names_[tid], &push_sparse_status_,
+              scale_datanorm_, cur_batch);
+        }
       }
+
       VLOG(3) << "push dense gradient done.";
 
       // the following code should be more precise and clean

diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc
@@ -29,6 +29,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/fleet/fleet_wrapper.h"
 #include <algorithm>
 #include <utility>
+#include "paddle/fluid/framework/channel.h"
+#include "paddle/fluid/framework/data_feed.h"
+#include "paddle/fluid/framework/io/fs.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/timer.h"
 
 namespace paddle {
 namespace framework {

diff --git a/paddle/fluid/framework/fleet/heter_wrapper.cc b/paddle/fluid/framework/fleet/heter_wrapper.cc
@@ -27,6 +27,15 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/fleet/heter_wrapper.h"
+#include <algorithm>
+#include <utility>
+#include "paddle/fluid/framework/channel.h"
+#include "paddle/fluid/framework/data_feed.h"
+#include "paddle/fluid/framework/device_worker.h"
+#include "paddle/fluid/framework/io/fs.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/timer.h"
 #ifdef PADDLE_WITH_PSLIB
 
 namespace paddle {

diff --git a/paddle/fluid/framework/hetercpu_worker.cc b/paddle/fluid/framework/hetercpu_worker.cc
@@ -12,6 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/framework/device_worker.h"
+#include "paddle/fluid/framework/device_worker_factory.h"
+#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
+#include "paddle/fluid/framework/fleet/heter_wrapper.h"
+#include "paddle/fluid/platform/cpu_helper.h"
+#include "paddle/fluid/string/string_helper.h"
+
 #ifdef PADDLE_WITH_PSLIB
 
 #if defined _WIN32 || defined __APPLE__

diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/device_worker.h"
 #include "paddle/fluid/framework/device_worker_factory.h"
+#include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h"
 #include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/lodtensor_printer.h"
@@ -47,6 +48,8 @@ void HogwildWorker::CreateThreadOperators(const ProgramDesc &program) {
     ops_.push_back(local_op_ptr);
     continue;
   }
+  operators::PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOp(
+      program, 0, ops_);
 }
 
 void HogwildWorker::CreateThreadScope(const ProgramDesc &program) {

diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc
@@ -84,19 +84,6 @@ void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const {
       VLOG(3) << "do not perform " + type() + "+bias fuse";
       return;
     }
-    if (conv->Op()->HasAttr("dilations")) {
-      auto dilations =
-          BOOST_GET_CONST(std::vector<int>, conv->Op()->GetAttr("dilations"));
-      for (const auto& d : dilations) {
-        if (d != 1) {
-          LOG(WARNING)
-              << "dilation conv not supported in MKLDNN, fuse not apply "
-              << "and set conv attribute use_mkldnn = false";
-          conv->Op()->SetAttr("use_mkldnn", false);
-          return;
-        }
-      }
-    }
 
     auto* eltwise_bias_tensor =
         scope->FindVar(eltwise_bias->Name())->GetMutable<LoDTensor>();

diff --git a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <string>
+
 #include "paddle/fluid/framework/ir/shuffle_channel_detect_pass.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 

diff --git a/paddle/fluid/framework/pull_dense_worker.cc b/paddle/fluid/framework/pull_dense_worker.cc
@@ -14,14 +14,11 @@ limitations under the License. */
 #include <time.h>
 
 #include "paddle/fluid/framework/device_worker.h"
+#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
 
 namespace paddle {
 namespace framework {
 
-class LoDTensor;
-class Scope;
-class Variable;
-
 std::shared_ptr<PullDenseWorker> PullDenseWorker::s_instance_ = NULL;
 std::mutex PullDenseWorker::mutex_for_version_;
 std::map<uint64_t, uint64_t> PullDenseWorker::last_versions_;
@@ -70,7 +67,7 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) {
 }
 
 void PullDenseWorker::CreatePinVar() {
-#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
   // for (auto& v : dense_value_names_) {
   //  for (auto& name : v.second) {
   for (int i = 0; i < dwp_param_.program_config(0).pull_dense_table_id_size();

diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
@@ -13,11 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/tensor_util.h"
+
 #include <algorithm>
 #include <limits>
 #include <memory>
+#include <string>
 #include <utility>
 #include <vector>
+
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/platform/profiler.h"
 
@@ -81,6 +84,12 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
   }
 #endif
 #ifdef PADDLE_WITH_CUDA
+  else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
+           platform::is_cuda_pinned_place(dst_place)) {
+    memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,
+                 BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place), src_ptr,
+                 size);
+  }
   else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
            platform::is_cpu_place(dst_place)) {
     memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
@@ -282,6 +291,12 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
   }
 #endif
 #ifdef PADDLE_WITH_CUDA
+  else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
+           platform::is_cuda_pinned_place(dst_place)) {
+    memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,
+                 BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place), src_ptr,
+                 size);
+  }
   else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
            platform::is_cpu_place(dst_place)) {
     memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
@@ -943,6 +958,12 @@ void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst) {
 #endif
 }
 
+template <typename T>
+std::string format_tensor(const framework::Tensor& tensor) {
+  // TODO(zhiqiu): use the print option to format tensor.
+  return "NOT IMPLEMENTED";
+}
+
 template <typename T>
 std::ostream& print_tensor(std::ostream& os, const framework::Tensor& tensor) {
   auto inspect = tensor.data<T>();