
Commit cf627bc
Add code of occupancy computing on DCU and avoid threadID bug for DCU profiler (PaddlePaddle#44520)
yuguo-Jack authored and Aurelius84 committed Jul 29, 2022
1 parent 939e645 commit cf627bc
Showing 10 changed files with 108 additions and 3 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -435,6 +435,7 @@ endif()
if(WITH_ROCM)
include(hip)
include(miopen) # set miopen libraries, must before configure
include(cupti)
endif()

if(WITH_XPU_KP)
7 changes: 7 additions & 0 deletions cmake/configure.cmake
@@ -178,6 +178,13 @@ elseif(WITH_ROCM)
add_definitions(-DEIGEN_USE_GPU)
add_definitions(-DEIGEN_USE_HIP)

if(CUPTI_FOUND)
include_directories(${CUPTI_INCLUDE_DIR})
add_definitions(-DPADDLE_WITH_CUPTI)
else()
message(STATUS "Cannot find CUPTI, GPU Profiling is incorrect.")
endif()

if(NOT MIOPEN_FOUND)
message(FATAL_ERROR "Paddle needs MIOpen to compile")
endif()
2 changes: 1 addition & 1 deletion cmake/cupti.cmake
@@ -1,4 +1,4 @@
if(NOT WITH_GPU)
if(NOT WITH_GPU AND NOT WITH_ROCM)
return()
endif()

3 changes: 3 additions & 0 deletions paddle/fluid/platform/dynload/CMakeLists.txt
@@ -35,6 +35,9 @@ if(NOT APPLE)
if(WITH_RCCL)
list(APPEND HIP_SRCS rccl.cc)
endif()
if(CUPTI_FOUND)
list(APPEND HIP_SRCS cupti.cc)
endif()
endif()
endif()

15 changes: 15 additions & 0 deletions paddle/fluid/platform/profiler/chrometracing_logger.cc
@@ -401,7 +401,11 @@ void ChromeTracingLogger::HandleTypeKernel(
float warps_per_sm = 0.0;
float occupancy = 0.0;
#if defined(PADDLE_WITH_CUPTI)
#ifdef PADDLE_WITH_HIP
constexpr int threads_per_warp = 64;
#else
constexpr int threads_per_warp = 32;
#endif
const gpuDeviceProp& device_property =
GetDeviceProperties(device_node.DeviceId());
blocks_per_sm = static_cast<float>(kernel_info.grid_x * kernel_info.grid_y *
@@ -411,6 +415,15 @@ void ChromeTracingLogger::HandleTypeKernel(
blocks_per_sm *
(kernel_info.block_x * kernel_info.block_y * kernel_info.block_z) /
threads_per_warp;
#ifdef PADDLE_WITH_HIP
occupancy = CalculateEstOccupancy(device_node.DeviceId(),
kernel_info.dynamic_shared_memory,
kernel_info.block_x,
kernel_info.block_y,
kernel_info.block_z,
kernel_info.kernelFunc,
kernel_info.launchType);
#else
occupancy = CalculateEstOccupancy(device_node.DeviceId(),
kernel_info.registers_per_thread,
kernel_info.static_shared_memory,
@@ -419,6 +432,8 @@
kernel_info.block_y,
kernel_info.block_z,
blocks_per_sm);
#endif // PADDLE_WITH_HIP

#endif
float dur = nsToMsFloat(device_node.Duration());
std::string dur_display;
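
Note: the warps-per-SM figure computed in the hunk above spreads the launch's total thread count across the device's multiprocessors and divides by the warp width, which on DCU (ROCm/HIP) is a 64-thread wavefront rather than the 32-thread warp assumed for CUDA. A minimal standalone sketch of that arithmetic follows; the helper name and parameters are illustrative only and are not part of this commit.

#include <hip/hip_runtime.h>

// Sketch: rough warps-per-SM estimate for a kernel launch on DCU (HIP build assumed).
float EstimateWarpsPerSm(const hipDeviceProp_t& prop,
                         int grid_x, int grid_y, int grid_z,
                         int block_x, int block_y, int block_z) {
  constexpr int threads_per_warp = 64;  // DCU wavefront width; 32 on CUDA devices
  float blocks_per_sm =
      static_cast<float>(grid_x * grid_y * grid_z) / prop.multiProcessorCount;
  return blocks_per_sm * (block_x * block_y * block_z) / threads_per_warp;
}
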
8 changes: 8 additions & 0 deletions paddle/fluid/platform/profiler/cupti_data_process.cc
@@ -52,6 +52,10 @@ void AddKernelRecord(const CUpti_ActivityKernel4* kernel,
event.kernel_info.queued = kernel->queued;
event.kernel_info.submitted = kernel->submitted;
event.kernel_info.completed = kernel->completed;
#ifdef PADDLE_WITH_HIP
event.kernel_info.kernelFunc = kernel->kernelFunc;
event.kernel_info.launchType = kernel->launchType;
#endif
collector->AddDeviceEvent(std::move(event));
}

@@ -279,7 +283,11 @@ void AddApiRecord(const CUpti_ActivityAPI* api,
} else {
tid = iter->second;
}
#ifdef PADDLE_WITH_HIP
event.thread_id = api->threadId;
#else
event.thread_id = tid;
#endif
event.correlation_id = api->correlationId;
event.callback_id = api->cbid;
collector->AddRuntimeEvent(std::move(event));
4 changes: 4 additions & 0 deletions paddle/fluid/platform/profiler/trace_event.h
@@ -105,6 +105,10 @@ struct KernelEventInfo {
uint64_t submitted;
// The completed timestamp for the kernel execution, in ns.
uint64_t completed;
#ifdef PADDLE_WITH_HIP
void* kernelFunc;
uint8_t launchType;
#endif
};

static constexpr size_t kMemKindMaxLen = 50;
56 changes: 55 additions & 1 deletion paddle/fluid/platform/profiler/utils.cc
@@ -43,6 +43,58 @@ std::string json_vector<std::string>(
}

#ifdef PADDLE_WITH_CUPTI

#ifdef PADDLE_WITH_HIP

#include "hip/hip_runtime.h"
float CalculateEstOccupancy(uint32_t DeviceId,
int32_t DynamicSharedMemory,
int32_t BlockX,
int32_t BlockY,
int32_t BlockZ,
void* kernelFunc,
uint8_t launchType) {
float occupancy = 0.0;
std::vector<int> device_ids = GetSelectedDevices();
if (DeviceId < device_ids.size()) {
const gpuDeviceProp& device_property = GetDeviceProperties(DeviceId);
int blockSize = BlockX * BlockY * BlockZ;
int numBlock = 0;
hipError_t status;
if (launchType == 0) {
status = hipOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlock, kernelFunc, blockSize, DynamicSharedMemory);
if (status == hipSuccess) {
occupancy = static_cast<double>(numBlock) * blockSize /
device_property.maxThreadsPerMultiProcessor;
} else {
LOG(WARNING) << "Failed to calculate estimated occupancy, status = "
<< status << std::endl;
}
} else if (launchType == 100) {
status = hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlock,
reinterpret_cast<hipFunction_t>(kernelFunc),
blockSize,
DynamicSharedMemory);
if (status == hipSuccess) {
occupancy = static_cast<double>(numBlock) * blockSize /
device_property.maxThreadsPerMultiProcessor;
} else {
LOG(WARNING) << "Failed to calculate estimated occupancy, status = "
<< status << std::endl;
}
} else {
LOG(WARNING) << "Failed to calculate estimated occupancy, can not "
"recognize launchType : "
<< launchType << std::endl;
}
}
return occupancy;
}

#else

float CalculateEstOccupancy(uint32_t DeviceId,
uint16_t RegistersPerThread,
int32_t StaticSharedMemory,
@@ -88,7 +140,9 @@ float CalculateEstOccupancy(uint32_t DeviceId,
}
return occupancy;
}
#endif
#endif // PADDLE_WITH_HIP

#endif // PADDLE_WITH_CUPTI

const char* StringTracerMemEventType(TracerMemEventType type) {
static const char* categary_name_[] = {
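
To illustrate the new HIP code path in CalculateEstOccupancy, the sketch below queries hipOccupancyMaxActiveBlocksPerMultiprocessor for a toy kernel and derives the same ratio of resident threads to maxThreadsPerMultiProcessor. It assumes a ROCm/HIP toolchain; the kernel, block size, and shared-memory figure are made-up values, not taken from the commit.

#include <hip/hip_runtime.h>
#include <cstdio>

__global__ void dummy_kernel(float* out) { out[blockIdx.x] = 1.0f; }

int main() {
  hipDeviceProp_t prop;
  (void)hipGetDeviceProperties(&prop, /*deviceId=*/0);

  int block_size = 256;   // block_x * block_y * block_z of the launch
  size_t dyn_smem = 0;    // dynamic shared memory per block, in bytes
  int num_blocks = 0;     // max simultaneously resident blocks per multiprocessor

  hipError_t status = hipOccupancyMaxActiveBlocksPerMultiprocessor(
      &num_blocks, reinterpret_cast<const void*>(dummy_kernel), block_size, dyn_smem);
  if (status == hipSuccess) {
    float occupancy = static_cast<float>(num_blocks) * block_size /
                      prop.maxThreadsPerMultiProcessor;
    printf("estimated occupancy: %f\n", occupancy);
  }
  return 0;
}
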
12 changes: 11 additions & 1 deletion paddle/fluid/platform/profiler/utils.h
@@ -125,6 +125,15 @@ static float nsToMsFloat(uint64_t end_ns, uint64_t start_ns = 0) {
}

#ifdef PADDLE_WITH_CUPTI
#ifdef PADDLE_WITH_HIP
float CalculateEstOccupancy(uint32_t DeviceId,
int32_t DynamicSharedMemory,
int32_t BlockX,
int32_t BlockY,
int32_t BlockZ,
void* kernelFunc,
uint8_t launchType);
#else
float CalculateEstOccupancy(uint32_t deviceId,
uint16_t registersPerThread,
int32_t staticSharedMemory,
@@ -133,7 +142,8 @@ float CalculateEstOccupancy(uint32_t deviceId,
int32_t blockY,
int32_t blockZ,
float blocksPerSm);
#endif
#endif // PADDLE_WITH_HIP
#endif // PADDLE_WITH_CUPTI

} // namespace platform
} // namespace paddle
3 changes: 3 additions & 0 deletions paddle/phi/backends/dynload/CMakeLists.txt
@@ -35,6 +35,9 @@ if(NOT APPLE)
if(WITH_RCCL)
list(APPEND HIP_SRCS rccl.cc)
endif()
if(CUPTI_FOUND)
list(APPEND HIP_SRCS cupti.cc)
endif()
endif()
endif()

