add xpu lars_momentum/pow2_decay (PaddlePaddle#44448)

*test=kunlun
Aurelius84 · Jul 29, 2022 · 58b599c · 58b599c
1 parent cf627bc
commit 58b599c
Show file tree

Hide file tree

Showing 7 changed files with 432 additions and 2 deletions.
diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake
@@ -10,7 +10,7 @@ set(XPU_RT_LIB_NAME "libxpurt.so")
 if(NOT DEFINED XPU_BASE_URL)
   set(XPU_BASE_URL_WITHOUT_DATE
       "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
-  set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220718")
+  set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220719")
 else()
   set(XPU_BASE_URL "${XPU_BASE_URL}")
 endif()
@@ -19,7 +19,7 @@ endif()
 if(NOT DEFINED XPU_XDNN_BASE_URL)
   set(XPU_XDNN_BASE_URL_WITHOUT_DATE
       "https://klx-sdk-release-public.su.bcebos.com/xdnn/dev")
-  set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220718")
+  set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220719")
 else()
   set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL}")
 endif()

diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op_xpu.cc b/paddle/fluid/operators/optimizers/lars_momentum_op_xpu.cc
@@ -0,0 +1,115 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/optimizers/lars_momentum_op.h"
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+
+namespace paddle {
+namespace operators {
+
+// XPU kernel for the `lars_momentum` optimizer op.
+//
+// Gathers the raw data pointers of every Param / Grad / Velocity /
+// LearningRate (and, when `multi_precision` is set, MasterParam) tensor and
+// hands them, together with the scalar attributes, to one `lars_momentum`
+// call on the XPU context. Each output tensor must share its buffer with the
+// matching input; this is enforced inside Compute.
+template <typename T>
+class LarsMomentumOpXPUKernel : public framework::OpKernel<T> {
+ public:
+  // Reads the op's inputs, outputs and attributes from `ctx`, checks that
+  // every output aliases its input (InvalidArgument otherwise) and runs the
+  // update. The status returned by `lars_momentum` is checked with
+  // PADDLE_ENFORCE_XDNN_SUCCESS.
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    bool multi_precision = ctx.Attr<bool>("multi_precision");
+    auto param_out = ctx.MultiOutput<framework::LoDTensor>("ParamOut");
+    auto velocity_out = ctx.MultiOutput<framework::LoDTensor>("VelocityOut");
+    auto param = ctx.MultiInput<framework::LoDTensor>("Param");
+    auto velocity = ctx.MultiInput<framework::LoDTensor>("Velocity");
+    auto learning_rate = ctx.MultiInput<framework::LoDTensor>("LearningRate");
+    auto grad = ctx.MultiInput<framework::LoDTensor>("Grad");
+    // Bound by const reference so the attribute vector is not copied.
+    const auto& weight_decay_arr =
+        ctx.Attr<std::vector<float>>("lars_weight_decay");
+    auto master_param = ctx.MultiInput<framework::LoDTensor>("MasterParam");
+    auto master_param_out =
+        ctx.MultiOutput<framework::LoDTensor>("MasterParamOut");
+    T mu = static_cast<T>(ctx.Attr<float>("mu"));
+    T lars_coeff = ctx.Attr<float>("lars_coeff");
+    T epsilon = ctx.Attr<float>("epsilon");
+    T rescale_grad = ctx.Attr<float>("rescale_grad");
+
+    // The narrowing from size_t is made explicit; the loop below and the
+    // size list both work with int.
+    const int op_num = static_cast<int>(param.size());
+
+    std::vector<T*> param_list;
+    std::vector<T*> grad_list;
+    std::vector<T*> param_out_list;
+    std::vector<float*> velocity_list;
+    std::vector<float*> velocity_out_list;
+    std::vector<float*> lrs;
+    std::vector<int> param_sizes;
+
+    std::vector<float*> master_param_list;
+    std::vector<float*> master_param_out_list;
+
+    // Every list receives exactly one entry per parameter group.
+    param_list.reserve(op_num);
+    grad_list.reserve(op_num);
+    param_out_list.reserve(op_num);
+    velocity_list.reserve(op_num);
+    velocity_out_list.reserve(op_num);
+    lrs.reserve(op_num);
+    param_sizes.reserve(op_num);
+    master_param_list.reserve(op_num);
+    master_param_out_list.reserve(op_num);
+
+    for (int i = 0; i < op_num; ++i) {
+      // The lists hold non-const pointers, hence the const_cast on inputs.
+      param_list.push_back(const_cast<T*>(param[i]->data<T>()));
+      grad_list.push_back(const_cast<T*>(grad[i]->data<T>()));
+      param_out_list.push_back(param_out[i]->mutable_data<T>(ctx.GetPlace()));
+      velocity_list.push_back(const_cast<float*>(velocity[i]->data<float>()));
+      velocity_out_list.push_back(
+          velocity_out[i]->mutable_data<float>(ctx.GetPlace()));
+      lrs.push_back(const_cast<float*>(learning_rate[i]->data<float>()));
+      // numel() is 64-bit; the size list is int, so narrow explicitly.
+      param_sizes.push_back(static_cast<int>(param[i]->numel()));
+
+      PADDLE_ENFORCE_EQ(
+          param_list[i],
+          param_out_list[i],
+          platform::errors::InvalidArgument(
+              "Input(Param) and Output(ParamOut) must be the same Tensors."));
+      PADDLE_ENFORCE_EQ(velocity_list[i],
+                        velocity_out_list[i],
+                        platform::errors::InvalidArgument(
+                            "Input(Velocity) and Output(VelocityOut) must be "
+                            "the same Tensors."));
+      if (multi_precision) {
+        master_param_list.push_back(
+            const_cast<float*>(master_param[i]->data<float>()));
+        master_param_out_list.push_back(
+            master_param_out[i]->mutable_data<float>(ctx.GetPlace()));
+        PADDLE_ENFORCE_EQ(master_param_list[i],
+                          master_param_out_list[i],
+                          platform::errors::InvalidArgument(
+                              "Input(MasterParam) and Output(MasterParamOut) "
+                              "must be the same Tensors."));
+      } else {
+        // Keep the master lists index-aligned with the other lists.
+        master_param_list.push_back(nullptr);
+        master_param_out_list.push_back(nullptr);
+      }
+    }
+
+    auto& dev_ctx = ctx.template device_context<platform::XPUDeviceContext>();
+    int r = lars_momentum(dev_ctx.x_context(),
+                          param_list,
+                          grad_list,
+                          velocity_list,
+                          lrs,
+                          master_param_list,
+                          param_out_list,
+                          velocity_out_list,
+                          master_param_out_list,
+                          weight_decay_arr,
+                          param_sizes,
+                          mu,
+                          lars_coeff,
+                          epsilon,
+                          rescale_grad);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "lars_momentum");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+// Registers the XPU kernel for the `lars_momentum` op. Only the float
+// instantiation of the kernel template is registered here.
+REGISTER_OP_XPU_KERNEL(lars_momentum, ops::LarsMomentumOpXPUKernel<float>);
+#endif
diff --git a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op_xpu.cc b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op_xpu.cc
@@ -0,0 +1,84 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifdef PADDLE_WITH_XPU
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h"
+#include "paddle/fluid/platform/macros.h"
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+
+namespace paddle {
+namespace operators {
+
+// XPU kernel for the `pow2_decay_with_linear_warmup` op.
+//
+// Validates that LearningRate/Step are present and updated in place
+// (LearningRateOut and StepOut must be the very same tensors), then forwards
+// the buffers and the schedule attributes to
+// `xpu::pow2_decay_with_linear_warmup`.
+template <typename T>
+class Pow2DecayWithLinearWarmupXPUOpKernel : public framework::OpKernel<T> {
+ public:
+  // Raises InvalidArgument when an input is null, when an output is not the
+  // same tensor as its input, when Step is uninitialized, or when
+  // warmup_steps exceeds total_steps. The XDNN status is checked with
+  // PADDLE_ENFORCE_XDNN_SUCCESS.
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const auto *lr = ctx.Input<framework::Tensor>("LearningRate");
+    const auto *step = ctx.Input<framework::Tensor>("Step");
+    auto *lr_out = ctx.Output<framework::Tensor>("LearningRateOut");
+    auto *step_out = ctx.Output<framework::Tensor>("StepOut");
+    // Null checks come first so that a missing input is reported as such
+    // rather than as an input/output mismatch.
+    PADDLE_ENFORCE_NOT_NULL(lr,
+                            platform::errors::InvalidArgument(
+                                "Input(LearningRate) should not be nullptr."));
+    PADDLE_ENFORCE_EQ(
+        lr,
+        lr_out,
+        platform::errors::InvalidArgument("Input(LearningRate) and "
+                                          "Output(LearningRateOut) "
+                                          "must be the same."));
+    PADDLE_ENFORCE_NOT_NULL(step,
+                            platform::errors::InvalidArgument(
+                                "Input(Step) should not be nullptr."));
+    PADDLE_ENFORCE_EQ(step,
+                      step_out,
+                      platform::errors::InvalidArgument(
+                          "Input(Step) and Output(StepOut) must be the same."));
+    PADDLE_ENFORCE_EQ(
+        step->IsInitialized(),
+        true,
+        platform::errors::InvalidArgument("Input(Step) must be initialized."));
+
+    // NOTE(review): a negative attribute value wraps to a very large size_t
+    // here; confirm that the op definition rules negative values out.
+    auto warmup_steps = static_cast<size_t>(ctx.Attr<int64_t>("warmup_steps"));
+    auto total_steps = static_cast<size_t>(ctx.Attr<int64_t>("total_steps"));
+    PADDLE_ENFORCE_LE(warmup_steps,
+                      total_steps,
+                      platform::errors::InvalidArgument(
+                          "warmup_steps must not be larger than total_steps."));
+    auto base_lr = ctx.Attr<float>("base_lr");
+    auto end_lr = ctx.Attr<float>("end_lr");
+
+    // Both buffers are taken from the outputs, which were checked above to be
+    // the same tensors as the inputs.
+    auto *lr_data = lr_out->data<T>();
+    auto *step_data = step_out->data<int64_t>();
+    auto &dev_ctx = ctx.template device_context<platform::XPUDeviceContext>();
+    int r = xpu::pow2_decay_with_linear_warmup(dev_ctx.x_context(),
+                                               lr_data,
+                                               step_data,
+                                               warmup_steps,
+                                               total_steps,
+                                               base_lr,
+                                               end_lr);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "pow2_decay_with_linear_warmup");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+// Registers the XPU kernel for the `pow2_decay_with_linear_warmup` op. Only
+// the float instantiation of the kernel template is registered here.
+REGISTER_OP_XPU_KERNEL(pow2_decay_with_linear_warmup,
+                       ops::Pow2DecayWithLinearWarmupXPUOpKernel<float>);
+#endif
diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h
@@ -71,6 +71,8 @@ XPUOpMap& get_kl2_ops() {
        XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                      pOpKernelType(vartype::FP16, XPUPlace())})},
       {"clip", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"coalesce_tensor",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"concat_grad",
        XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                      pOpKernelType(vartype::FP16, XPUPlace())})},
@@ -255,6 +257,8 @@ XPUOpMap& get_kl2_ops() {
        XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"label_smooth",
        XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"lars_momentum",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"layer_norm_grad",
        XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"layer_norm_grad",
@@ -334,6 +338,8 @@ XPUOpMap& get_kl2_ops() {
                      pOpKernelType(vartype::FP16, XPUPlace())})},
       {"pow", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"pow_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"pow2_decay_with_linear_warmup",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"prior_box", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
       {"range",
        XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),

diff --git a/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py b/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py
@@ -88,6 +88,7 @@
     'dropout_float16',
     'dropout_grad_float16',
     "grad_add_float32",  # no api for grad_add, skip
+    "lars_momentum_float32",
     "resnet_unit",
     "resnet_unit_grad"
 ]

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_coalesce_tensor_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_coalesce_tensor_op_xpu.py
@@ -0,0 +1,128 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from paddle.fluid import core
+import sys
+
+sys.path.append("..")
+from op_test import OpTest
+
+alignment = 256
+import paddle
+from op_test_xpu import XPUOpTest
+from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper
+
+paddle.enable_static()
+
+
+class XPUTestCoalesceTensorOp(XPUOpTestWrapper):
+
+    def __init__(self):
+        self.op_name = 'coalesce_tensor'
+        self.use_dynamic_create_class = False
+
+    class TestAllocContinuousSpace(XPUOpTest):
+
+        def setUp(self):
+            self.op_type = "coalesce_tensor"
+            self.use_xpu = True
+            self.dtype, self.fluid_dtype = self.init_dtype()
+            attrs = self.init_attr()
+            self.copy_data = attrs["copy_data"]
+            self.constant = attrs["constant"]
+            self.set_constant = attrs["set_constant"]
+            self.Inputs = self.init_input()
+            self.Outputs, self.FusedOutput = self.init_output(
+                self.Inputs, self.set_constant, self.constant)
+            self.inputs = {'Input': self.Inputs}
+            self.attrs = attrs
+            self.outputs = {
+                'Output': self.Outputs,
+                'FusedOutput': self.FusedOutput
+            }
+
+        def init_dtype(self):
+            return np.float32, int(core.VarDesc.VarType.FP32)
+
+        def init_input(self):
+            inputs = []
+            inputs.append(("x1", np.random.random([20, 3]).astype(self.dtype)))
+            inputs.append(("x2", np.random.random([20]).astype(self.dtype)))
+            inputs.append(("x3", np.random.random([1]).astype(self.dtype)))
+            inputs.append(("x4", np.random.random([200,
+                                                   30]).astype(self.dtype)))
+            inputs.append(("x5", np.random.random([30]).astype(self.dtype)))
+            inputs.append(("x6", np.random.random([1]).astype(self.dtype)))
+            return inputs
+
+        def init_attr(self):
+            return {
+                "copy_data": True,
+                "set_constant": False,
+                "constant": 0.0,
+                "dtype": self.fluid_dtype
+            }
+
+        def init_output(self, input_list, set_constant, constant):
+            inputs = []
+            outputs = input_list
+
+            for input in input_list:
+                length = len(input[1].flatten())
+                aligned_len = (length + alignment) / alignment * alignment
+                out = np.zeros(int(aligned_len))
+                out[0:length] = input[1].flatten()
+                inputs.append(out)
+
+            coalesce_tensor_var = np.concatenate([input for input in inputs])
+            if set_constant:
+                coalesce_tensor_var = np.ones(
+                    (len(coalesce_tensor_var))) * constant
+                outputs = [(out[0],
+                            np.ones(out[1].shape).astype(self.dtype) * constant)
+                           for out in outputs]
+            return outputs, coalesce_tensor_var
+
+        def test_check_output(self):
+            self.check_output_with_place(place=core.XPUPlace(0),
+                                         no_check_set=["FusedOutput"],
+                                         atol=1e-5)
+
+    class TestAllocContinuousSpace2(TestAllocContinuousSpace):
+
+        def init_attr(self):
+            return {
+                "copy_data": False,
+                "set_constant": True,
+                "constant": 0.5,
+                "dtype": self.fluid_dtype,
+                "user_defined_size_of_dtype": 2
+            }
+
+        def test_check_output(self):
+            self.check_output_with_place(place=core.XPUPlace(0),
+                                         no_check_set=["FusedOutput"],
+                                         atol=1e-5)
+
+
+# Create one concrete test class per dtype reported for `coalesce_tensor`
+# and place it in this module's globals.
+support_types = get_xpu_op_support_types('coalesce_tensor')
+for stype in support_types:
+    create_test_class(globals(), XPUTestCoalesceTensorOp, stype)
+
+if __name__ == '__main__':
+    unittest.main()