Skip to content

Commit

Permalink
Implement a first draft of the ROCmService
Browse files Browse the repository at this point in the history
  • Loading branch information
fwyzard committed Feb 1, 2023
1 parent bbcce06 commit 4f5ff35
Show file tree
Hide file tree
Showing 9 changed files with 761 additions and 0 deletions.
11 changes: 11 additions & 0 deletions HeterogeneousCore/ROCmServices/BuildFile.xml
@@ -0,0 +1,11 @@
<!-- Build description for the HeterogeneousCore/ROCmServices library.
     Everything is wrapped in <iftool name="rocm">, so presumably nothing here
     applies unless the "rocm" tool is configured (TODO confirm against the SCRAM documentation). -->
<iftool name="rocm">
<use name="rocm"/>
<use name="FWCore/MessageLogger"/>
<use name="FWCore/ParameterSet"/>
<use name="FWCore/ServiceRegistry"/>
<use name="FWCore/Utilities"/>
<use name="HeterogeneousCore/ROCmUtilities" source_only="true"/>
<export>
<lib name="1"/>
</export>
</iftool>
45 changes: 45 additions & 0 deletions HeterogeneousCore/ROCmServices/interface/ROCmService.h
@@ -0,0 +1,45 @@
#ifndef HeterogeneousCore_ROCmServices_interface_ROCmService_h
#define HeterogeneousCore_ROCmServices_interface_ROCmService_h

#include <utility>
#include <vector>

#include "FWCore/Utilities/interface/StreamID.h"

// Forward declarations: these framework types are only used by reference in the interface below.
namespace edm {
  class ParameterSet;
  class ActivityRegistry;
  class ConfigurationDescriptions;
}  // namespace edm

/// Service interface giving read-only access to the ROCm state cached in its data members:
/// whether the service is enabled, how many devices are known, and their compute capabilities.
class ROCmService {
public:
  /// Builds the service from its configuration.
  /// Marked `explicit` so that an edm::ParameterSet is never silently converted into a ROCmService.
  explicit ROCmService(edm::ParameterSet const& config);
  ~ROCmService();

  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);

  /// Returns the value of the `enabled_` flag (false unless the constructor sets it).
  bool enabled() const { return enabled_; }

  /// Returns the cached number of devices (0 unless the constructor sets it).
  int numberOfDevices() const { return numberOfDevices_; }

  /// Returns the (major, minor) compute capability stored for `device`.
  /// Uses std::vector::at(), so an index outside the stored range throws std::out_of_range.
  std::pair<int, int> computeCapability(int device) const { return computeCapabilities_.at(device); }

  /// Returns the id of device with most free memory. If none is found, returns -1.
  int deviceWithMostFreeMemory() const;

private:
  int numberOfDevices_ = 0;
  std::vector<std::pair<int, int>> computeCapabilities_;  // one (major, minor) pair per entry
  bool enabled_ = false;
  bool verbose_ = false;
};

// Overload selected for ROCmService pointers; it unconditionally reports `true`.
// NOTE(review): presumably the framework uses this to treat ROCmService as a
// process-wide service - confirm against the ServiceRegistry documentation.
namespace edm::service {
  inline bool isProcessWideService(ROCmService const*) {
    return true;
  }
}  // namespace edm::service

#endif // HeterogeneousCore_ROCmServices_interface_ROCmService_h
12 changes: 12 additions & 0 deletions HeterogeneousCore/ROCmServices/plugins/BuildFile.xml
@@ -0,0 +1,12 @@
<!-- Build description for the HeterogeneousCore/ROCmServices plugins:
     all *.cc files in this directory go into the HeterogeneousCoreROCmServicesPlugins
     library, flagged as an EDM plugin. The <iftool name="rocm"> wrapper presumably
     disables the whole block when the "rocm" tool is not configured (TODO confirm). -->
<iftool name="rocm">
<use name="rocm"/>
<use name="DataFormats/Provenance"/>
<use name="FWCore/MessageLogger"/>
<use name="FWCore/ParameterSet"/>
<use name="FWCore/ServiceRegistry"/>
<use name="HeterogeneousCore/ROCmServices"/>
<use name="HeterogeneousCore/ROCmUtilities" source_only="true"/>
<library file="*.cc" name="HeterogeneousCoreROCmServicesPlugins">
<flags EDM_PLUGIN="1"/>
</library>
</iftool>
120 changes: 120 additions & 0 deletions HeterogeneousCore/ROCmServices/plugins/ROCmMonitoringService.cc
@@ -0,0 +1,120 @@
#include <iostream>

#include <hip/hip_runtime.h>

#include "DataFormats/Provenance/interface/ModuleDescription.h"
#include "FWCore/MessageLogger/interface/MessageLogger.h"
#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
#include "FWCore/ParameterSet/interface/ParameterSet.h"
#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
#include "FWCore/ServiceRegistry/interface/ActivityRegistry.h"
#include "FWCore/ServiceRegistry/interface/ModuleCallingContext.h"
#include "FWCore/ServiceRegistry/interface/Service.h"
#include "FWCore/ServiceRegistry/interface/ServiceMaker.h"
#include "HeterogeneousCore/ROCmServices/interface/ROCmService.h"
#include "HeterogeneousCore/ROCmUtilities/interface/hipCheck.h"

// Forward declaration: StreamContext is only passed by const reference to the
// activity callbacks in this file and none of its members are accessed.
namespace edm {
class StreamContext;
}

// Service that prints the memory usage of every ROCm device to the message logger
// at selected points of the job, as enabled by its untracked configuration flags.
class ROCmMonitoringService {
public:
// Reads the configuration flags and registers the matching callbacks with the
// ActivityRegistry; nothing is registered if the ROCmService is not enabled.
ROCmMonitoringService(edm::ParameterSet const& iConfig, edm::ActivityRegistry& iRegistry);
~ROCmMonitoringService() = default;

// Declares the configuration parameters of the service and their defaults.
static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);

// Activity callbacks: each one logs the per-device memory usage, prefixed by a
// message identifying the point at which it was called.
void postModuleConstruction(edm::ModuleDescription const& desc);
void postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc);
void postModuleEvent(edm::StreamContext const& sc, edm::ModuleCallingContext const& mcc);
void postEvent(edm::StreamContext const& sc);

private:
// Number of devices to report on; copied from the ROCmService in the constructor,
// and left at 0 when that service is disabled.
int numberOfDevices_ = 0;
};

// Sets up the monitoring: caches the number of devices and subscribes to the
// framework signals selected by the boolean configuration flags.
ROCmMonitoringService::ROCmMonitoringService(edm::ParameterSet const& config, edm::ActivityRegistry& registry) {
  // Access the ROCmService first: this makes sure that ROCm is initialised, and that
  // the ROCmService destructor is called after this service's destructor.
  edm::Service<ROCmService> rocm;

  if (rocm->enabled()) {
    numberOfDevices_ = rocm->numberOfDevices();

    // Reads one of the untracked boolean switches of this service.
    auto const requested = [&config](char const* flag) -> bool { return config.getUntrackedParameter<bool>(flag); };

    if (requested("memoryConstruction")) {
      registry.watchPostModuleConstruction(this, &ROCmMonitoringService::postModuleConstruction);
    }
    if (requested("memoryBeginStream")) {
      registry.watchPostModuleBeginStream(this, &ROCmMonitoringService::postModuleBeginStream);
    }
    if (requested("memoryPerModule")) {
      registry.watchPostModuleEvent(this, &ROCmMonitoringService::postModuleEvent);
    }
    if (requested("memoryPerEvent")) {
      registry.watchPostEvent(this, &ROCmMonitoringService::postEvent);
    }
  }
  // When the ROCmService is disabled no callback is registered and numberOfDevices_ stays 0.
}

// Describes the configuration of the service: four untracked boolean switches,
// one per kind of memory report. Only "memoryConstruction" is off by default.
void ROCmMonitoringService::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
  edm::ParameterSetDescription serviceDesc;

  auto* atConstruction = serviceDesc.addUntracked<bool>("memoryConstruction", false);
  atConstruction->setComment("Print memory information for each device after the construction of each module");

  auto* atBeginStream = serviceDesc.addUntracked<bool>("memoryBeginStream", true);
  atBeginStream->setComment("Print memory information for each device after the beginStream() of each module");

  auto* afterEachModule = serviceDesc.addUntracked<bool>("memoryPerModule", true);
  afterEachModule->setComment("Print memory information for each device after the event of each module");

  auto* afterEachEvent = serviceDesc.addUntracked<bool>("memoryPerEvent", true);
  afterEachEvent->setComment("Print memory information for each device after each event");

  descriptions.add("ROCmMonitoringService", serviceDesc);

  // General caveat attached to the description of the service as a whole.
  descriptions.setComment(
      "The memory information is the global state of the device. This gets confusing if there are multiple processes "
      "running on the same device. Probably the information retrieval should be re-thought?");
}

// activity handlers
namespace {
  // Streams into `log` one line for each device in [0, num), reporting the used and
  // total memory in units of 2^20 bytes. The device that was current on entry is made
  // current again before returning.
  template <typename T>
  void dumpUsedMemory(T& log, int num) {
    constexpr auto bytesPerMB = 1 << 20;

    // remember the current device, as hipMemGetInfo() is queried after switching device
    int previousDevice = 0;
    hipCheck(hipGetDevice(&previousDevice));

    for (int device = 0; device < num; ++device) {
      hipCheck(hipSetDevice(device));

      size_t freeBytes = 0;
      size_t totalBytes = 0;
      hipCheck(hipMemGetInfo(&freeBytes, &totalBytes));

      auto const usedMB = (totalBytes - freeBytes) / bytesPerMB;
      auto const totalMB = totalBytes / bytesPerMB;
      log << "\n" << device << ": " << usedMB << " MB used / " << totalMB << " MB total";
    }

    hipCheck(hipSetDevice(previousDevice));
  }
}  // namespace

// Called after a module has been constructed: logs the label and type name of the
// module, followed by the memory usage of each device.
void ROCmMonitoringService::postModuleConstruction(edm::ModuleDescription const& desc) {
  edm::LogPrint report("ROCmMonitoringService");
  report << "ROCm device memory after construction of ";
  report << desc.moduleLabel();
  report << " (" << desc.moduleName() << ")";
  dumpUsedMemory(report, numberOfDevices_);
}

// Called after the beginStream() of a module: logs the label and type name of the
// module, followed by the memory usage of each device.
void ROCmMonitoringService::postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) {
  edm::LogPrint report("ROCmMonitoringService");
  auto const& description = *mcc.moduleDescription();
  report << "ROCm device memory after beginStream() of " << description.moduleLabel();
  report << " (" << description.moduleName() << ")";
  dumpUsedMemory(report, numberOfDevices_);
}

// Called after a module has processed an event: logs the label and type name of the
// module, followed by the memory usage of each device.
void ROCmMonitoringService::postModuleEvent(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) {
  edm::LogPrint report("ROCmMonitoringService");
  auto const& description = *mcc.moduleDescription();
  report << "ROCm device memory after processing an event by " << description.moduleLabel();
  report << " (" << description.moduleName() << ")";
  dumpUsedMemory(report, numberOfDevices_);
}

// Called after an event has been processed: logs the memory usage of each device.
// The StreamContext is not used, so the parameter is left unnamed (as in the other
// handlers) to avoid an unused-parameter warning.
void ROCmMonitoringService::postEvent(edm::StreamContext const&) {
  auto log = edm::LogPrint("ROCmMonitoringService");
  log << "ROCm device memory after event";
  dumpUsedMemory(log, numberOfDevices_);
}

// Presumably registers ROCmMonitoringService as a framework service plugin
// (macro from FWCore/ServiceRegistry/interface/ServiceMaker.h) - TODO confirm.
DEFINE_FWK_SERVICE(ROCmMonitoringService);
4 changes: 4 additions & 0 deletions HeterogeneousCore/ROCmServices/plugins/plugins.cc
@@ -0,0 +1,4 @@
#include "HeterogeneousCore/ROCmServices/interface/ROCmService.h"
#include "FWCore/ServiceRegistry/interface/ServiceMaker.h"

// Presumably registers ROCmService as a framework service plugin, using a maker that
// builds it from the ParameterSet alone (its constructor takes no ActivityRegistry) - TODO confirm.
DEFINE_FWK_SERVICE_MAKER(ROCmService, edm::serviceregistry::ParameterSetMaker<ROCmService>);

0 comments on commit 4f5ff35

Please sign in to comment.