rbd-mirror: fix sparse read optimization in image sync #12368

Merged
1 commit merged on Dec 9, 2016
82 changes: 82 additions & 0 deletions src/test/rbd_mirror/test_ImageSync.cc
@@ -136,6 +136,88 @@ TEST_F(TestImageSync, Simple) {
}
}

TEST_F(TestImageSync, Resize) {
int64_t object_size = std::min<int64_t>(
m_remote_image_ctx->size, 1 << m_remote_image_ctx->order);

uint64_t off = 0;
uint64_t len = object_size / 10;

std::string str(len, '1');
ASSERT_EQ((int)len, m_remote_image_ctx->aio_work_queue->write(off, len,
str.c_str(), 0));
{
RWLock::RLocker owner_locker(m_remote_image_ctx->owner_lock);
ASSERT_EQ(0, m_remote_image_ctx->flush());
}

ASSERT_EQ(0, create_snap(m_remote_image_ctx, "snap", nullptr));

uint64_t size = object_size - 1;
librbd::NoOpProgressContext no_op_progress_ctx;
ASSERT_EQ(0, m_remote_image_ctx->operations->resize(size, true,
no_op_progress_ctx));

C_SaferCond ctx;
ImageSync<> *request = create_request(&ctx);
request->send();
ASSERT_EQ(0, ctx.wait());

bufferlist read_remote_bl;
read_remote_bl.append(std::string(len, '\0'));
bufferlist read_local_bl;
read_local_bl.append(std::string(len, '\0'));

ASSERT_LE(0, m_remote_image_ctx->aio_work_queue->read(
off, len, read_remote_bl.c_str(), 0));
ASSERT_LE(0, m_local_image_ctx->aio_work_queue->read(
off, len, read_local_bl.c_str(), 0));

ASSERT_TRUE(read_remote_bl.contents_equal(read_local_bl));
}

TEST_F(TestImageSync, Discard) {
int64_t object_size = std::min<int64_t>(
m_remote_image_ctx->size, 1 << m_remote_image_ctx->order);

uint64_t off = 0;
uint64_t len = object_size / 10;

std::string str(len, '1');
ASSERT_EQ((int)len, m_remote_image_ctx->aio_work_queue->write(off, len,
str.c_str(), 0));
{
RWLock::RLocker owner_locker(m_remote_image_ctx->owner_lock);
ASSERT_EQ(0, m_remote_image_ctx->flush());
}

ASSERT_EQ(0, create_snap(m_remote_image_ctx, "snap", nullptr));

ASSERT_EQ((int)len - 2, m_remote_image_ctx->aio_work_queue->discard(off + 1,
len - 2));
{
RWLock::RLocker owner_locker(m_remote_image_ctx->owner_lock);
ASSERT_EQ(0, m_remote_image_ctx->flush());
}

C_SaferCond ctx;
ImageSync<> *request = create_request(&ctx);
request->send();
ASSERT_EQ(0, ctx.wait());

bufferlist read_remote_bl;
read_remote_bl.append(std::string(object_size, '\0'));
bufferlist read_local_bl;
read_local_bl.append(std::string(object_size, '\0'));

ASSERT_LE(0, m_remote_image_ctx->aio_work_queue->read(
off, len, read_remote_bl.c_str(), 0));
ASSERT_LE(0, m_local_image_ctx->aio_work_queue->read(
off, len, read_local_bl.c_str(), 0));

ASSERT_TRUE(read_remote_bl.contents_equal(read_local_bl));
}

TEST_F(TestImageSync, SnapshotStress) {
std::list<std::string> snap_names;

42 changes: 34 additions & 8 deletions src/tools/rbd_mirror/image_sync/ObjectCopyRequest.cc
@@ -174,23 +174,48 @@ void ObjectCopyRequest<I>::send_write_object() {

auto &sync_ops = m_snap_sync_ops.begin()->second;
assert(!sync_ops.empty());
uint64_t object_offset;
uint64_t buffer_offset;
librados::ObjectWriteOperation op;
for (auto &sync_op : sync_ops) {
switch (std::get<0>(sync_op)) {
case SYNC_OP_TYPE_WRITE:
object_offset = std::get<1>(sync_op);
buffer_offset = 0;
for(auto it : std::get<4>(sync_op)){
bufferlist tmpbl;
tmpbl.substr_of(std::get<3>(sync_op), buffer_offset, it.second);
op.write(it.first, tmpbl);
op.set_op_flags2(LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL |
LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
buffer_offset += it.second;
dout(20) << ": write op: " << it.first<< "~" << it.second << dendl;
for (auto it : std::get<4>(sync_op)) {
Review comment:

I was thinking you could just use the interval_set: initialize one to the requested offset + length, subtract out the extents within the sparse read result, and issue zero / truncate ops for the remaining extents. The truncate would only (possibly) apply to the last extent if its offset + length == the object size. (A rough sketch of this idea appears after this thread.)

Contributor Author:

Using interval_set was my initial thought too, but with my current approach I preserve the order of operations (i.e. each subsequent op is for a larger offset), while with the interval_set approach it looks like I would generate the write ops first and then the zero/truncate ops, so the offset order would be lost. Does it make any difference to the store how the ops are ordered?

Review comment:

While it shouldn't matter, I suppose it's possible that sending them out-of-order could expose some edge condition within the OSDs that is not normally exercised. Probably best to just keep your current implementation to be safe.
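
For reference, a minimal sketch of the interval_set alternative described above. This is not the code that was merged (the merged change keeps the single offset-ordered loop shown in this diff); it assumes Ceph's interval_set<uint64_t> from include/interval_set.h, and the helper name and signature are hypothetical.

// Sketch only: compute the holes of a sparse read by subtracting the returned
// extents from the requested range, then zero each remaining gap, or truncate
// when the final gap runs to the end of the object.
#include "include/interval_set.h"
#include "include/rados/librados.hpp"
#include <cstdint>
#include <map>

static void zero_or_truncate_gaps(librados::ObjectWriteOperation *op,
                                  uint64_t req_off, uint64_t req_len,
                                  uint64_t object_size,
                                  const std::map<uint64_t, uint64_t> &extent_map) {
  if (req_len == 0) {
    return;
  }

  // Start from the full requested range ...
  interval_set<uint64_t> gaps;
  gaps.insert(req_off, req_len);

  // ... and subtract every extent the sparse read actually returned.
  // (interval_set::subtract requires the subtracted extents to lie within the
  // set, which holds here because sparse-read extents fall inside the request.)
  interval_set<uint64_t> data;
  for (auto &extent : extent_map) {
    if (extent.second > 0) {
      data.insert(extent.first, extent.second);
    }
  }
  gaps.subtract(data);

  // Whatever remains was not covered by data: zero it, or truncate if the
  // last gap extends to the end of the object.
  for (auto it = gaps.begin(); it != gaps.end(); ++it) {
    if (it.get_start() + it.get_len() == object_size) {
      op->truncate(it.get_start());
    } else {
      op->zero(it.get_start(), it.get_len());
    }
  }
}

As noted in the reply above, this emits the zero/truncate ops after (rather than interleaved with) the write ops, which is why the merged change keeps the single offset-ordered pass instead.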

if (object_offset < it.first) {
dout(20) << ": zero op: " << object_offset << "~"
<< it.first - object_offset << dendl;
op.zero(object_offset, it.first - object_offset);
}
dout(20) << ": write op: " << it.first << "~" << it.second << dendl;
bufferlist tmpbl;
tmpbl.substr_of(std::get<3>(sync_op), buffer_offset, it.second);
op.write(it.first, tmpbl);
op.set_op_flags2(LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL |
LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
buffer_offset += it.second;
object_offset = it.first + it.second;
}
if (object_offset < std::get<1>(sync_op) + std::get<2>(sync_op)) {
uint64_t sync_op_end = std::get<1>(sync_op) + std::get<2>(sync_op);
assert(sync_op_end <= m_snap_object_sizes[remote_snap_seq]);
if (sync_op_end == m_snap_object_sizes[remote_snap_seq]) {
dout(20) << ": trunc op: " << object_offset << dendl;
op.truncate(object_offset);
m_snap_object_sizes[remote_snap_seq] = object_offset;
} else {
dout(20) << ": zero op: " << object_offset << "~"
<< sync_op_end - object_offset << dendl;
op.zero(object_offset, sync_op_end - object_offset);
}
}
break;
case SYNC_OP_TYPE_TRUNC:
if (std::get<1>(sync_op) > m_snap_object_sizes[remote_snap_seq]) {
// skip (must have been updated in WRITE op case issuing trunc op)
break;
}
dout(20) << ": trunc op: " << std::get<1>(sync_op) << dendl;
op.truncate(std::get<1>(sync_op));
break;
@@ -353,6 +378,7 @@ void ObjectCopyRequest<I>::compute_diffs() {
bufferlist(),
std::map<uint64_t, uint64_t>());
}
m_snap_object_sizes[end_remote_snap_id] = end_size;
Review comment:

Not sure what this is protecting against. The same end_size is encoded in the trunc op above.

Follow-up:

... never mind, I see how it's updated in the WRITE op case. (An illustrative sketch of this bookkeeping appears after this thread.)
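
To make the interplay above concrete, a tiny illustrative sketch (hypothetical names, not code from this PR): the WRITE case records the object size it produced after truncating the tail, and the TRUNC case is skipped when it would not shrink the object any further.

// Illustrative bookkeeping only, mirroring the role of m_snap_object_sizes:
// remote snap seq -> object size as left by the ops issued so far.
#include <cstdint>
#include <map>

struct SnapObjectSizeTracker {
  std::map<uint64_t, uint64_t> sizes;

  // WRITE case: after issuing op.truncate(object_offset), remember the new size.
  void record_truncate(uint64_t snap_seq, uint64_t object_offset) {
    sizes[snap_seq] = object_offset;
  }

  // TRUNC case: a truncate beyond the recorded size is redundant and can be
  // skipped (this mirrors the "skip" check in the TRUNC case above).
  bool should_skip_truncate(uint64_t snap_seq, uint64_t trunc_offset) const {
    auto it = sizes.find(snap_seq);
    return it != sizes.end() && trunc_offset > it->second;
  }
};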

} else {
if (prev_exists) {
// object remove
2 changes: 2 additions & 0 deletions src/tools/rbd_mirror/image_sync/ObjectCopyRequest.h
@@ -85,6 +85,7 @@ class ObjectCopyRequest {
typedef std::list<SyncOp> SyncOps;
typedef std::map<librados::snap_t, SyncOps> SnapSyncOps;
typedef std::map<librados::snap_t, uint8_t> SnapObjectStates;
typedef std::map<librados::snap_t, uint64_t> SnapObjectSizes;

ImageCtxT *m_local_image_ctx;
ImageCtxT *m_remote_image_ctx;
@@ -102,6 +103,7 @@ class ObjectCopyRequest {

SnapSyncOps m_snap_sync_ops;
SnapObjectStates m_snap_object_states;
SnapObjectSizes m_snap_object_sizes;

void send_list_snaps();
void handle_list_snaps(int r);