diff --git a/dipu/torch_dipu/csrc_dipu/aten/ops/AutoCompareUtils.hpp b/dipu/torch_dipu/csrc_dipu/aten/ops/AutoCompareUtils.hpp index ab3537950..2077521dd 100644 --- a/dipu/torch_dipu/csrc_dipu/aten/ops/AutoCompareUtils.hpp +++ b/dipu/torch_dipu/csrc_dipu/aten/ops/AutoCompareUtils.hpp @@ -1,35 +1,25 @@ #pragma once +#include #include +#include #include #include +#include #include #include #include +#include +#include #include #include -#include "csrc_dipu/aten/ops/DIPUCopy.hpp" -#include "csrc_dipu/runtime/device/deviceapis.h" +#include "csrc_dipu/aten/ops/OpUtils.hpp" namespace dipu { namespace native { -inline at::Tensor to_cpu_without_diopi(const at::Tensor& in) { - if (in.is_cpu()) { - return in; - } - - at::Tensor out = at::empty_strided(in.sizes(), in.strides(), - in.options().device(c10::Device("cpu"))); - if (in.nbytes() > 0) { - dipu::devapis::memCopyD2H(out.storage().nbytes(), out.data_ptr(), - in.data_ptr()); - } - return out; -} - inline std::string cpu_tensor_to_one_line_string(const at::Tensor& tensor) { /* * This function retrieves the built-in string representation of the input @@ -91,7 +81,7 @@ inline std::string allclose_autocompare(const at::Tensor& tensor_cpu, constexpr double tolerance_absolute = 1e-4; constexpr double tolerance_relative = 1e-5; const at::Tensor& tensor_cpu_from_device = - to_cpu_without_diopi(tensor_device); + toCpuTensorWithoutDiopiCopy(tensor_device); bool passed = at::allclose(tensor_cpu, tensor_cpu_from_device, tolerance_absolute, tolerance_relative, true); if (passed) { diff --git a/dipu/torch_dipu/csrc_dipu/aten/ops/OpUtils.hpp b/dipu/torch_dipu/csrc_dipu/aten/ops/OpUtils.hpp index e472351ad..0578d57be 100644 --- a/dipu/torch_dipu/csrc_dipu/aten/ops/OpUtils.hpp +++ b/dipu/torch_dipu/csrc_dipu/aten/ops/OpUtils.hpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -16,6 +17,8 @@ #include #include #include +#include +#include #include #include #include @@ -23,12 +26,28 @@ #include #include "csrc_dipu/runtime/core/DIPUStream.h" +#include "csrc_dipu/runtime/device/deviceapis.h" #include "csrc_dipu/runtime/rthelper.h" #include "csrc_dipu/utils/Log.h" namespace dipu { namespace native { +// avoid infinite recursion when dumpArg() before calling diopiCopy() +inline at::Tensor toCpuTensorWithoutDiopiCopy(const at::Tensor& in) { + if (in.is_cpu()) { + return in; + } + + at::Tensor out = at::empty_strided(in.sizes(), in.strides(), + in.options().device(c10::Device("cpu"))); + if (in.nbytes() > 0) { + dipu::devapis::memCopyD2H(out.storage().nbytes(), out.data_ptr(), + in.data_ptr()); + } + return out; +} + inline bool checkTensorDevice() { static bool enable = []() { const char* env_ptr = std::getenv("DIPU_CHECK_TENSOR_DEVICE"); @@ -114,7 +133,7 @@ inline std::string dumpArg(const at::Tensor& tensor) { << ", storage_data_ptr: " << tensor.storage().data_ptr().get() << ", storage_offset: " << tensor.storage_offset(); if (dumpOpArgLevel() > 2) { - stream << '\n' << tensor; + stream << '\n' << toCpuTensorWithoutDiopiCopy(tensor); } } else { stream << "undefined";