os/bluestore: fix race condition during blob splitting #11422

Merged (5 commits) on Oct 11, 2016
src/os/bluestore/BlueStore.cc (+15 -43)
@@ -1520,6 +1520,7 @@ bool BlueStore::ExtentMap::update(Onode *o, KeyValueDB::Transaction t,
             << p->offset << std::dec << " is " << bl.length()
             << " bytes (was " << p->shard_info->bytes << ") from " << n
             << " extents" << dendl;
+    assert(p->shard_info->offset == p->offset);
     p->shard_info->bytes = bl.length();
     p->shard_info->extents = n;
     if (!force &&
@@ -1568,19 +1569,17 @@ void BlueStore::ExtentMap::reshard(Onode *o, uint64_t min_alloc_size)
<< " target " << target << " slop " << slop << dendl;

// reshard
auto ep = extent_map.begin();
auto sp = o->onode.extent_map_shards.begin();
auto esp = o->onode.extent_map_shards.end();
unsigned shard_end = 0;
unsigned estimate = 0;
unsigned offset = 0;
vector<bluestore_onode_t::shard_info> new_shard_info;
unsigned max_blob_end = 0;
while (ep != extent_map.end()) {
dout(30) << " ep " << *ep << dendl;
assert(!ep->blob->is_spanning());
if (shard_end == 0 ||
ep->logical_offset >= shard_end) {
for (auto& e: extent_map) {
dout(30) << " extent " << e << dendl;
assert(!e.blob->is_spanning());
if (shard_end == 0 || e.logical_offset >= shard_end) {
if (sp == esp) {
// inline case
shard_end = o->onode.size;
@@ -1598,7 +1597,7 @@ void BlueStore::ExtentMap::reshard(Onode *o, uint64_t min_alloc_size)
              << std::dec << dendl;
     }
     // disfavor shard boundaries that span a blob
-    bool would_span = (ep->logical_offset < max_blob_end) || ep->blob_offset;
+    bool would_span = (e.logical_offset < max_blob_end) || e.blob_offset;
     if (estimate &&
         estimate + extent_avg > target + (would_span ? slop : 0)) {
       // new shard
@@ -1608,19 +1607,18 @@ void BlueStore::ExtentMap::reshard(Onode *o, uint64_t min_alloc_size)
dout(20) << __func__ << " new shard 0x" << std::hex << offset
<< std::dec << dendl;
}
offset = ep->logical_offset;
offset = e.logical_offset;
new_shard_info.emplace_back(bluestore_onode_t::shard_info());
new_shard_info.back().offset = offset;
dout(20) << __func__ << " new shard 0x" << std::hex << offset << std::dec
<< dendl;
estimate = 0;
}
estimate += extent_avg;
uint32_t be = ep->blob_end();
uint32_t be = e.blob_end();
if (be > max_blob_end) {
max_blob_end = be;
}
++ep;
}
o->onode.extent_map_shards.swap(new_shard_info);

@@ -1631,7 +1629,6 @@ void BlueStore::ExtentMap::reshard(Onode *o, uint64_t min_alloc_size)
   // identify spanning blobs
   if (!o->onode.extent_map_shards.empty()) {
     dout(20) << __func__ << " checking for spanning blobs" << dendl;
-    auto ep = extent_map.begin();
     auto sp = o->onode.extent_map_shards.begin();
     auto esp = o->onode.extent_map_shards.end();
     unsigned shard_start = 0;
@@ -1643,9 +1640,9 @@ void BlueStore::ExtentMap::reshard(Onode *o, uint64_t min_alloc_size)
       shard_end = sp->offset;
     }
     int bid = 0;
-    while (ep != extent_map.end()) {
-      dout(30) << " ep " << *ep << dendl;
-      while (ep->logical_offset >= shard_end) {
+    for (auto& e : extent_map) {
+      dout(30) << " extent " << e << dendl;
+      while (e.logical_offset >= shard_end) {
         shard_start = shard_end;
         ++sp;
         if (sp == esp) {
@@ -1656,17 +1653,17 @@ void BlueStore::ExtentMap::reshard(Onode *o, uint64_t min_alloc_size)
dout(30) << __func__ << " shard 0x" << std::hex << shard_start
<< " to 0x" << shard_end << std::dec << dendl;
}
if (ep->blob->id < 0 &&
ep->blob_escapes_range(shard_start, shard_end - shard_start)) {
if (e.blob->id < 0 &&
e.blob_escapes_range(shard_start, shard_end - shard_start)) {
// We have two options: (1) split the blob into pieces at the
// shard boundaries (and adjust extents accordingly), or (2)
// mark it spanning. We prefer to cut the blob if we can. Note that
// we may have to split it multiple times--potentially at every
// shard boundary.
bool must_span = false;
BlobRef b = ep->blob;
BlobRef b = e.blob;
if (b->can_split()) {
uint32_t bstart = ep->logical_offset - ep->blob_offset;
uint32_t bstart = e.logical_offset - e.blob_offset;
uint32_t bend = bstart + b->get_blob().get_logical_length();
for (const auto& sh : shards) {
Review comment (project member): I think a simpler fix would be to take the cache lock just for the can_split() writing.empty() check. We have the collection lock, so no new writes will race with us; only IO completion. So if writing is empty now, it's safe to drop the lock and assume it will still be empty below when we do the split.
             if (bstart < sh.offset && bend > sh.offset) {
@@ -1696,7 +1693,6 @@ void BlueStore::ExtentMap::reshard(Onode *o, uint64_t min_alloc_size)
dout(20) << __func__ << " adding spanning " << *b << dendl;
}
}
++ep;
}
}
}
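As an aside, the straddle test above (bstart < sh.offset && bend > sh.offset) decides whether a blob crosses a shard boundary. A self-contained sketch with invented sample values:

    #include <cassert>
    #include <cstdint>

    // A blob [bstart, bend) must be cut at a shard boundary it crosses;
    // a blob wholly inside one shard is left alone.
    static bool straddles(uint32_t bstart, uint32_t bend, uint32_t boundary) {
      return bstart < boundary && bend > boundary;
    }

    int main() {
      assert(straddles(0x1000, 0x5000, 0x3000));   // crosses 0x3000 -> split
      assert(!straddles(0x3000, 0x4000, 0x8000));  // inside one shard -> keep
      return 0;
    }

When the test fires but can_split() is false, the loop instead marks the blob spanning, as the comment in the diff describes.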
@@ -4344,30 +4340,6 @@ int BlueStore::fsck()
         break;
       }
       used_omap_head.insert(o->onode.omap_head);
-      // hrm, scan actual key/value pairs?
-      KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP);
-      if (!it)
-        break;
-      string head, tail;
-      get_omap_header(o->onode.omap_head, &head);
-      get_omap_tail(o->onode.omap_head, &tail);
-      it->lower_bound(head);
-      while (it->valid()) {
-        if (it->key() == head) {
-          dout(30) << __func__ << " got header" << dendl;
-        } else if (it->key() >= tail) {
-          dout(30) << __func__ << " reached tail" << dendl;
-          break;
-        } else {
-          string user_key;
-          decode_omap_key(it->key(), &user_key);
-          dout(30) << __func__
-                   << " got " << pretty_binary_string(it->key())
-                   << " -> " << user_key << dendl;
-        }
-        it->next();
-      }
       break;
     }
   }
src/os/bluestore/BlueStore.h (+1 -0)
@@ -437,6 +437,7 @@ class BlueStore : public ObjectStore,
   }

   bool can_split() const {
+    std::lock_guard<std::recursive_mutex> l(shared_blob->bc.cache->lock);
     // splitting a BufferSpace writing list is too hard; don't try.
     return shared_blob->bc.writing.empty() && get_blob().can_split();
   }
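This one-line change is the core of the fix: can_split() reads bc.writing, which IO-completion threads can mutate (per the review comment above), so the check must hold the buffer cache lock to see a consistent view. A simplified, self-contained sketch of the pattern, with invented stand-in types:

    #include <list>
    #include <mutex>

    // Invented stand-ins for SharedBlob's BufferSpace and its cache lock.
    struct BufferSpaceSketch {
      std::recursive_mutex lock;   // plays the role of bc.cache->lock
      std::list<int> writing;      // buffers with writes still in flight
    };

    // Before the fix, reading writing.empty() unlocked could race with an IO
    // completion thread erasing from the list; holding the lock makes the
    // emptiness check consistent with completions.
    bool is_idle(BufferSpaceSketch& bc) {
      std::lock_guard<std::recursive_mutex> l(bc.lock);
      return bc.writing.empty();
    }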
src/os/bluestore/bluestore_types.cc (+0 -5)
@@ -661,11 +661,6 @@ void bluestore_shared_blob_t::decode(bufferlist::iterator& p)
 void bluestore_shared_blob_t::dump(Formatter *f) const
 {
   f->dump_object("ref_map", ref_map);
-  f->open_array_section("objects");
-  /*for (auto &o : objects) {
-    f->dump_object("object", o);
-  }*/
-  f->close_section();
 }

 void bluestore_shared_blob_t::generate_test_instances(
src/os/bluestore/bluestore_types.h (+0 -1)
@@ -604,7 +604,6 @@ ostream& operator<<(ostream& out, const bluestore_blob_t& o);
 /// shared blob state
 struct bluestore_shared_blob_t {
   bluestore_extent_ref_map_t ref_map;  ///< shared blob extents
-  //set<ghobject_t> objects;           ///< objects referencing these shared blocks (debug)

   void encode(bufferlist& bl) const;
   void decode(bufferlist::iterator& p);