Commit f343431

[Allocator] Automatically end memory allocation policy generation
yitongh committed May 5, 2022
1 parent 00cadf4 commit f343431
Showing 4 changed files with 82 additions and 39 deletions.
13 changes: 2 additions & 11 deletions docs/CPU-Memory-Optimization.md
@@ -3,21 +3,12 @@
 ## Feature overview
 
 On the CPU side, high-performance malloc libraries suffer severe minor page faults when allocating large chunks of memory, which seriously hurts performance. This feature reduces memory usage and minor page faults, improving runtime performance.
-The feature collects information at runtime and then optimizes, so the performance gain only becomes observable after a certain number of steps.
+The feature collects memory allocation information at runtime (once the step count reaches the `START_STATISTIC_STEP` threshold) and then generates an allocation policy from each step's allocation information. While generating a policy, it checks whether the previously generated policy already satisfies the current allocation requests; if so, the step counts as a stable step. Information collection stops once the number of stable steps reaches the `STABLE_STATISTIC_STEP` threshold, or the total number of statistics steps reaches the `MAX_STATISTIC_STEP` threshold. Because allocation information has to be collected before optimizing, the performance gain only becomes observable after a certain number of steps.
 
 ## User interface
 
 On the CPU side, the current DeepRec version supports the memory optimization for both single-node and distributed training. The optimization is enabled by default and can be disabled with `export ENABLE_MEMORY_OPTIMIZATION=0`.
-There are two environment variables, `START_STATISTIC_STEP` and `STOP_STATISTIC_STEP`, which configure the step at which statistics collection starts and the step at which collection stops and optimization begins; the defaults are 1000 and 1100. The initial cold-start time can be reduced with the following settings:
-
-```bash
-export START_STATISTIC_STEP=100
-export STOP_STATISTIC_STEP=200
-```
-
-In general, set the start step to at least 100 to skip the initialization graphs. Note that these steps are not the same as training steps.
-If there are many initialization graphs, raise the start and stop steps accordingly.
-After roughly `STOP_STATISTIC_STEP` steps, the run time becomes noticeably shorter.
+The environment variables mentioned above, `START_STATISTIC_STEP`, `STABLE_STATISTIC_STEP`, and `MAX_STATISTIC_STEP`, configure the step at which statistics collection starts, how many stable steps are required before policy generation ends, and the maximum number of steps policy generation may run. The defaults are 100, 10, and 100, respectively. These values normally do not need to be changed; increase `START_STATISTIC_STEP` when there are many initialization graphs, and increase `STABLE_STATISTIC_STEP` and `MAX_STATISTIC_STEP` when the graph is irregular or many small subgraphs are executed.
 
 ### Using jemalloc
 On the CPU side, the memory optimization can be combined with the jemalloc library. Set the `MALLOC` environment variable and prepend `LD_PRELOAD` with jemalloc's shared library to the python command, for example:
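(The file's own example is collapsed in this view.) As an illustrative sketch of the interface described above — the threshold values are simply the documented defaults, and the jemalloc path and `train.py` are placeholders, not part of the commit:

```bash
# Thresholds for allocation-policy generation (defaults shown).
export START_STATISTIC_STEP=100    # step at which statistics collection begins
export STABLE_STATISTIC_STEP=10    # consecutive stable steps that end generation
export MAX_STATISTIC_STEP=100      # hard cap on statistics-collection steps

# Combining with jemalloc as the section describes; the .so path and
# script name are placeholders for your environment.
LD_PRELOAD=/path/to/libjemalloc.so python train.py
```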
98 changes: 73 additions & 25 deletions tensorflow/core/common_runtime/memory_planner.cc
@@ -10,8 +10,9 @@
 namespace tensorflow {
 
 namespace {
-constexpr int64 DEFAULT_START_STATISTIC_STEP = 1000;
-constexpr int64 DEFAULT_STOP_STATISTIC_STEP = 1100;
+constexpr int64 DEFAULT_START_STATISTIC_STEP = 100;
+constexpr int64 DEFAULT_STABLE_STATISTIC_STEP = 10;
+constexpr int64 DEFAULT_MAX_STATISTIC_STEP = 100;
 }
 
 MemoryPlanner::MemoryPlanner() :
@@ -20,7 +21,10 @@ MemoryPlanner::MemoryPlanner() :
     thread_pool_(nullptr),
     counter_(0),
     start_step_(DEFAULT_START_STATISTIC_STEP),
-    stop_step_(DEFAULT_STOP_STATISTIC_STEP) {
+    stable_step_(DEFAULT_STABLE_STATISTIC_STEP),
+    max_stat_step_(DEFAULT_MAX_STATISTIC_STEP),
+    current_stable_step_(0),
+    current_stat_step_(0) {
   InitPolicy();
   InitStepInfo();
 }
@@ -41,17 +45,31 @@ void MemoryPlanner::InitStepInfo() {
   Status s = ReadInt64FromEnvVar("START_STATISTIC_STEP",
                                  DEFAULT_START_STATISTIC_STEP,
                                  &start_step_);
-  s = ReadInt64FromEnvVar("STOP_STATISTIC_STEP",
-                          DEFAULT_STOP_STATISTIC_STEP,
-                          &stop_step_);
+  if (!s.ok()) {
+    LOG(FATAL) << "Read START_STATISTIC_STEP environment error. "
+               << s.error_message();
+  }
+  s = ReadInt64FromEnvVar("STABLE_STATISTIC_STEP",
+                          DEFAULT_STABLE_STATISTIC_STEP,
+                          &stable_step_);
+  if (!s.ok()) {
+    LOG(FATAL) << "Read STABLE_STATISTIC_STEP environment error. "
+               << s.error_message();
+  }
+  s = ReadInt64FromEnvVar("MAX_STATISTIC_STEP",
+                          DEFAULT_MAX_STATISTIC_STEP,
+                          &max_stat_step_);
+  if (!s.ok()) {
+    LOG(FATAL) << "Read MAX_STATISTIC_STEP environment error. "
+               << s.error_message();
+  }
 }
 
 // lifetime policy
 LifetimePolicy* MemoryPlanner::BestLifetimePolicy() {
   LifetimePolicy* best_policy = nullptr;
   auto total_mem = std::numeric_limits<size_t>::max();
   for (auto policy : lifetime_stats_polices_) {
-    policy->BestFit();
     auto policy_mem = policy->TotalMem();
     if (policy_mem < total_mem) {
       best_policy = policy;
@@ -72,14 +90,35 @@ void MemoryPlanner::StartCollect() {
   auto current = counter_.fetch_add(1);
   if (current == start_step_) {
     is_stats_ = true;
-  } else if (current == stop_step_) {
-    is_stats_ = false;
-    CollectDone();
   }
 }
 
 void MemoryPlanner::StopCollect() {
-  // Make sure counter_ load is atomic.
+  if (is_stats_) {
+    Schedule([this]() {
+      // Stop collecting stats while the policy is being generated.
+      is_stats_ = false;
+      ++current_stat_step_;
+      bool stable = true;
+      for (auto policy : lifetime_stats_polices_) {
+        if (!policy->BestFit()) {
+          stable = false;
+        }
+      }
+      if (stable) {
+        ++current_stable_step_;
+      } else {
+        current_stable_step_ = 0;
+      }
+      if (current_stable_step_ > stable_step_
+          || current_stat_step_ > max_stat_step_) {
+        VLOG(2) << "end planner: " << current_stat_step_;
+        CollectDone();
+      } else {
+        is_stats_ = true;
+      }
+    });
+  }
 }
 
 void MemoryPlanner::CollectDone() {
@@ -250,18 +289,24 @@ void LifetimeBin::TrackDeallocate(AllocStats* stats) {
   stats_.emplace_back(stats);
 }
 
-void LifetimePolicy::BestFit() {
-  {
-    std::lock_guard<spin_lock> l(large_bin_lock_);
-    for (auto it = large_bins_.rbegin();
-         it != large_bins_.rend(); ++it) {
-      auto bin_info = it->second;
-      bin_info->BestFit(this);
-    }
+bool LifetimePolicy::BestFit() {
+  bool stable = true;
+  std::lock_guard<spin_lock> l(large_bin_lock_);
+  for (auto it = large_bins_.rbegin();
+       it != large_bins_.rend(); ++it) {
+    auto bin_info = it->second;
+    bool ret = bin_info->BestFit(this);
+    if (!ret) {
+      stable = false;
+    }
   }
   for (auto it = bins_.rbegin(); it != bins_.rend(); ++it) {
-    (*it)->BestFit(this);
+    bool ret = (*it)->BestFit(this);
+    if (!ret) {
+      stable = false;
+    }
   }
+  return stable;
 }
 
 std::vector<LifetimeBin*>& LifetimePolicy::GetBins() {
@@ -301,11 +346,12 @@ size_t LifetimePolicy::Interval() {
   return interval_;
 }
 
-void LifetimeBin::BestFit(LifetimePolicy* policy) {
+bool LifetimeBin::BestFit(LifetimePolicy* policy) {
   std::lock_guard<spin_lock> l(stats_lock_);
   if (stats_.empty()) {
-    return;
+    return true;
   }
+  bool stable = true;
   for (auto s : stats_) {
     auto block = FindBlock(s);
     if (block != nullptr) {
@@ -323,7 +369,10 @@ void LifetimeBin::BestFit(LifetimePolicy* policy) {
     block = new AllocBlock(chunk_size_, bin_index_);
     block->Insert(s);
     blocks_.emplace_back(block);
+    stable = false;
   }
   stats_.clear();
+  return stable;
 }
 
 AllocBlock* LifetimeBin::FindBlock(AllocStats* stats) {
@@ -349,14 +398,13 @@ size_t LifetimeBin::Alignment() const {
 
 AllocBlock* LifetimePolicy::FindBlock(
     AllocStats* stats, size_t bindex) {
-  for (size_t i = bindex; i < large_bin_index_; ++i) {
-    auto block = bins_[i]->FindBlock(stats);
+  for ( ; bindex < large_bin_index_; ++bindex) {
+    auto block = bins_[bindex]->FindBlock(stats);
     if (block != nullptr) {
       return block;
     }
   }
-  // no need to lock, BestFit already hold large_bin_lock_ firstly
-  for (auto it = large_bins_.begin();
+  for (auto it = large_bins_.lower_bound(bindex);
        it != large_bins_.end(); ++it) {
     auto block = (it->second)->FindBlock(stats);
     if (block != nullptr) {
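The new termination path above logs `end planner: <step>` at VLOG level 2 once policy generation stops. As a sketch of how one might surface that message during a run — `TF_CPP_MIN_VLOG_LEVEL` is TensorFlow's standard vlog switch, and `train.py` is a placeholder for your training entry point:

```bash
# Raise the vlog threshold so VLOG(2) messages, including
# "end planner: <step>", appear in the log output.
export TF_CPP_MIN_VLOG_LEVEL=2
python train.py
```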
9 changes: 6 additions & 3 deletions tensorflow/core/common_runtime/memory_planner.h
@@ -90,7 +90,7 @@ class LifetimeBin {
   void TrackDeallocate(AllocStats* stats);
   size_t TotalMem() const;
   void Dump() const;
-  void BestFit(LifetimePolicy* policy);
+  bool BestFit(LifetimePolicy* policy);
   void Cleanup();
 
   AllocBlock* FindBlock(AllocStats* stats);
@@ -128,7 +128,7 @@ class LifetimePolicy {
 
   AllocBlock* FindBlock(AllocStats* stats, size_t bin_index);
 
-  void BestFit();
+  bool BestFit();
   size_t Interval();
 
   std::vector<LifetimeBin*>& GetBins();
@@ -213,7 +213,10 @@ class MemoryPlanner : public MemoryPlannerBase {
   // step information
   std::atomic<int64_t> counter_;
   int64 start_step_;
-  int64 stop_step_;
+  int64 stable_step_;
+  int64 max_stat_step_;
+  int64 current_stable_step_;
+  int64 current_stat_step_;
 };
 
 class MemoryPlannerFactory {
1 change: 1 addition & 0 deletions tensorflow/core/common_runtime/tensorpool_allocator.cc
@@ -143,6 +143,7 @@ void TensorPoolAllocator::Init() {
       }
       lifetime_bins_[(*it)->BinIndex()] = bin;
     }
+    LOG(INFO) << "TensorPoolAllocator enabled";
    inited_ = true;
  }
 }
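With the new log line above, one quick way to confirm the allocator actually initialized is to scan the job output. A sketch, assuming logs go to stderr and `train.py` is a placeholder:

```bash
# TensorFlow logs to stderr; look for the allocator's init message.
python train.py 2>&1 | grep "TensorPoolAllocator enabled"
```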
