Skip to content

Commit

Permalink
os/bluestore: avoid extra dev flush on single device when all io is deferred
Browse files Browse the repository at this point in the history

If we have no non-deferred IO to flush, and we are running bluefs on a
single shared device, then we can rely on the bluefs flush to make our
current batch of deferred ios stable.

Separate deferred into a "done" and "stable" list.  If we do sync, put
everything from "done" onto "stable".  Otherwise, after we do our kv
commit via bluefs, move "done" to "stable" then.

Signed-off-by: Sage Weil <sage@redhat.com>
  • Loading branch information
liewegas committed Mar 10, 2017
1 parent c8feaba commit 2261ed2
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 19 deletions.
82 changes: 64 additions & 18 deletions src/os/bluestore/BlueStore.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3933,6 +3933,7 @@ int BlueStore::_open_db(bool create)
bluefs->get_block_device_size(BlueFS::BDEV_DB) - BLUEFS_START);
}
bluefs_shared_bdev = BlueFS::BDEV_SLOW;
bluefs_single_shared_device = false;
} else {
bluefs_shared_bdev = BlueFS::BDEV_DB;
}
Expand Down Expand Up @@ -3987,6 +3988,7 @@ int BlueStore::_open_db(bool create)
BDEV_LABEL_BLOCK_SIZE);
}
cct->_conf->set_val("rocksdb_separate_wal_dir", "true");
bluefs_single_shared_device = false;
} else {
cct->_conf->set_val("rocksdb_separate_wal_dir", "false");
}
Expand Down Expand Up @@ -7457,7 +7459,9 @@ void BlueStore::_kv_sync_thread()
std::unique_lock<std::mutex> l(kv_lock);
while (true) {
assert(kv_committing.empty());
if (kv_queue.empty() && deferred_cleanup_queue.empty()) {
if (kv_queue.empty() &&
deferred_done_queue.empty() &&
deferred_stable_queue.empty()) {
if (kv_stop)
break;
dout(20) << __func__ << " sleep" << dendl;
Expand All @@ -7466,22 +7470,59 @@ void BlueStore::_kv_sync_thread()
dout(20) << __func__ << " wake" << dendl;
} else {
deque<TransContext*> kv_submitting;
deque<TransContext*> deferred_cleaning;
deque<TransContext*> deferred_done, deferred_stable;
dout(20) << __func__ << " committing " << kv_queue.size()
<< " submitting " << kv_queue_unsubmitted.size()
<< " cleaning " << deferred_cleanup_queue.size() << dendl;
<< " deferred done " << deferred_done_queue.size()
<< " stable " << deferred_stable_queue.size()
<< dendl;
kv_committing.swap(kv_queue);
kv_submitting.swap(kv_queue_unsubmitted);
deferred_cleaning.swap(deferred_cleanup_queue);
deferred_done.swap(deferred_done_queue);
deferred_stable.swap(deferred_stable_queue);
utime_t start = ceph_clock_now();
l.unlock();

dout(30) << __func__ << " committing txc " << kv_committing << dendl;
dout(30) << __func__ << " submitting txc " << kv_submitting << dendl;
dout(30) << __func__ << " deferred_cleaning txc " << deferred_cleaning << dendl;
dout(30) << __func__ << " committing " << kv_committing << dendl;
dout(30) << __func__ << " submitting " << kv_submitting << dendl;
dout(30) << __func__ << " deferred_done " << deferred_done << dendl;
dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl;

// flush/barrier on block device
bdev->flush();
int num_aios = 0;
for (auto txc : kv_committing) {
if (txc->had_ios) {
++num_aios;
}
}

bool force_flush = false;
// if bluefs is sharing the same device as data (only), then we
// can rely on the bluefs commit to flush the device and make
// deferred aios stable. that means that if we do have done deferred
// txcs AND we are not on a single device, we need to force a flush.
if (!bluefs || (!bluefs_single_shared_device && !deferred_done.empty())) {
force_flush = true;
}
if (kv_committing.empty() && kv_submitting.empty() &&
deferred_stable.empty()) {
force_flush = true; // there's nothing else to commit!
}
if (deferred_aggressive) {
force_flush = true;
}

if (num_aios || force_flush) {
dout(20) << __func__ << " num_aios=" << num_aios
<< " force_flush=" << (int)force_flush
<< ", flushing, deferred done->stable" << dendl;
// flush/barrier on block device
bdev->flush();

// if we flush then deferred done are now deferred stable
deferred_stable.insert(deferred_stable.end(), deferred_done.begin(),
deferred_done.end());
deferred_done.clear();
}

// we will use one final transaction to force a sync
KeyValueDB::Transaction synct = db->get_transaction();
Expand Down Expand Up @@ -7519,9 +7560,11 @@ void BlueStore::_kv_sync_thread()
--txc->osr->kv_committing_serially;
txc->set_kv_submitted();
}
for (auto txc : kv_committing) {
if (txc->had_ios) {
--txc->osr->txc_with_unstable_io;
if (num_aios) {
for (auto txc : kv_committing) {
if (txc->had_ios) {
--txc->osr->txc_with_unstable_io;
}
}
}

Expand All @@ -7542,7 +7585,7 @@ void BlueStore::_kv_sync_thread()
}

// cleanup sync deferred keys
for (auto txc : deferred_cleaning) {
for (auto txc : deferred_stable) {
bluestore_deferred_transaction_t& wt = *txc->deferred_txn;
if (!wt.released.empty()) {
// kraken replay compat only
Expand Down Expand Up @@ -7573,18 +7616,18 @@ void BlueStore::_kv_sync_thread()
utime_t finish = ceph_clock_now();
utime_t dur = finish - start;
dout(20) << __func__ << " committed " << kv_committing.size()
<< " cleaned " << deferred_cleaning.size()
<< " cleaned " << deferred_stable.size()
<< " in " << dur << dendl;
while (!kv_committing.empty()) {
TransContext *txc = kv_committing.front();
assert(txc->state == TransContext::STATE_KV_SUBMITTED);
_txc_state_proc(txc);
kv_committing.pop_front();
}
while (!deferred_cleaning.empty()) {
TransContext *txc = deferred_cleaning.front();
while (!deferred_stable.empty()) {
TransContext *txc = deferred_stable.front();
_txc_state_proc(txc);
deferred_cleaning.pop_front();
deferred_stable.pop_front();
}

if (!deferred_aggressive) {
Expand Down Expand Up @@ -7613,6 +7656,9 @@ void BlueStore::_kv_sync_thread()
}

l.lock();
// previously deferred "done" are now "stable" by virtue of this
// commit cycle.
deferred_stable_queue.swap(deferred_done);
}
}
dout(10) << __func__ << " finish" << dendl;
Expand Down Expand Up @@ -7737,7 +7783,7 @@ int BlueStore::_deferred_finish(TransContext *txc)
txc->osr->qcond.notify_all();
throttle_deferred_ops.put(txc->ops);
throttle_deferred_bytes.put(txc->bytes);
deferred_cleanup_queue.push_back(txc);
deferred_done_queue.push_back(txc);
}

// in the normal case, do not bother waking up the kv thread; it will
Expand Down
4 changes: 3 additions & 1 deletion src/os/bluestore/BlueStore.h
Original file line number Diff line number Diff line change
Expand Up @@ -1652,6 +1652,7 @@ class BlueStore : public ObjectStore,
private:
BlueFS *bluefs = nullptr;
unsigned bluefs_shared_bdev = 0; ///< which bluefs bdev we are sharing
bool bluefs_single_shared_device = true;
KeyValueDB *db = nullptr;
BlockDevice *bdev = nullptr;
std::string freelist_type;
Expand Down Expand Up @@ -1697,7 +1698,8 @@ class BlueStore : public ObjectStore,
deque<TransContext*> kv_queue; ///< ready, already submitted
deque<TransContext*> kv_queue_unsubmitted; ///< ready, need submit by kv thread
deque<TransContext*> kv_committing; ///< currently syncing
deque<TransContext*> deferred_cleanup_queue; ///< deferred done, ready for cleanup
deque<TransContext*> deferred_done_queue; ///< deferred ios done
deque<TransContext*> deferred_stable_queue; ///< deferred ios done + stable

PerfCounters *logger = nullptr;

Expand Down

0 comments on commit 2261ed2

Please sign in to comment.