Skip to content

Commit

Permalink
Merge pull request #10128 from xiexingguo/xxg-wip-bluestore-2016-07-05
Browse files Browse the repository at this point in the history
os/bluestore: introduce power 2 macros for block alignment and rounding

Reviewed-by: Sage Weil <sage@redhat.com>
  • Loading branch information
liewegas committed Jul 7, 2016
2 parents d1a10c5 + 67d9667 commit 956c769
Show file tree
Hide file tree
Showing 3 changed files with 123 additions and 43 deletions.
51 changes: 46 additions & 5 deletions src/include/intarith.h
Expand Up @@ -16,25 +16,66 @@
#define CEPH_INTARITH_H

#ifndef MIN
# define MIN(a,b) ((a) < (b) ? (a):(b))
#define MIN(a,b) ((a) < (b) ? (a):(b))
#endif

#ifndef MAX
# define MAX(a,b) ((a) > (b) ? (a):(b))
#define MAX(a,b) ((a) > (b) ? (a):(b))
#endif

#ifndef DIV_ROUND_UP
# define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
#endif

#ifndef ROUND_UP_TO
# define ROUND_UP_TO(n, d) ((n)%(d) ? ((n)+(d)-(n)%(d)) : (n))
#define ROUND_UP_TO(n, d) ((n)%(d) ? ((n)+(d)-(n)%(d)) : (n))
#endif

#ifndef SHIFT_ROUND_UP
# define SHIFT_ROUND_UP(x,y) (((x)+(1<<(y))-1) >> (y))
#define SHIFT_ROUND_UP(x,y) (((x)+(1<<(y))-1) >> (y))
#endif

/*
* Evaluates to true when x has at most one bit set, i.e. x is a power
* of 2. Note that 0 also passes this test; callers that must reject 0
* have to check for it separately.
* x is expanded twice, so do not pass an expression with side effects.
*/
#define ISP2(x) (!((x) & ((x) - 1)))

/*
* Macros for various sorts of alignment and rounding. The "align"
* argument must be a power of 2. Typically it is the size of a block,
* sector, or page.
*/

/*
* return x rounded down to an align boundary
* eg, P2ALIGN(1200, 1024) == 1024 (1*align)
* eg, P2ALIGN(1024, 1024) == 1024 (1*align)
* eg, P2ALIGN(0x1234, 0x100) == 0x1200 (0x12*align)
* eg, P2ALIGN(0x5600, 0x100) == 0x5600 (0x56*align)
*
* align must be a power of 2. x and align may have different integer
* types: the mask is negated in their common type. The never-taken
* "0 ? (x) :" arm only lends its type to the expression (x is not
* evaluated there, so each argument is still evaluated exactly once).
* Negating a 32-bit unsigned align on its own would produce a
* zero-extended mask that wipes the upper bits of a 64-bit x.
* eg, P2ALIGN(0x100001234ull, 0x100u) == 0x100001200ull
*/
#define P2ALIGN(x, align) ((x) & -(0 ? (x) : (align)))

/*
* return the offset of x within its align-sized block, i.e. x modulo
* align (align must be a power of 2)
* eg, P2PHASE(0x1234, 0x100) == 0x34 (x-0x12*align)
* eg, P2PHASE(0x5600, 0x100) == 0x00 (x-0x56*align)
*/
#define P2PHASE(x, align) (((align) - 1) & (x))

/*
* return how much space is left in this block, i.e. the distance from
* x up to the next align boundary (but if x is perfectly aligned,
* return 0).
* eg, P2NPHASE(0x1234, 0x100) == 0xcc (0x13*align-x)
* eg, P2NPHASE(0x5600, 0x100) == 0x00 (0x56*align-x)
*
* align must be a power of 2. x is negated in the common type of x and
* align: the never-taken "0 ? (align) :" arm only lends its type (align
* is not evaluated there). Negating a narrow unsigned x on its own
* would lose the high bits needed when align is wider than x.
* eg, P2NPHASE(0x1234u, 1ull << 40) == (1ull << 40) - 0x1234
*/
#define P2NPHASE(x, align) (-(0 ? (align) : (x)) & ((align) - 1))

/*
* return x rounded up to an align boundary
* eg, P2ROUNDUP(0x1234, 0x100) == 0x1300 (0x13*align)
* eg, P2ROUNDUP(0x5600, 0x100) == 0x5600 (0x56*align)
*
* align must be a power of 2. x and align may have different integer
* types: both negations are done in their common type. In each
* "0 ? (a) : (b)" only (b) is evaluated; (a) merely lends its type, so
* every argument is still evaluated exactly once. Negating a 32-bit
* unsigned operand on its own would zero-extend it and corrupt the
* upper half of a 64-bit result.
* eg, P2ROUNDUP(0x100001234ull, 0x100u) == 0x100001300ull
*/
#define P2ROUNDUP(x, align) \
  (-(-(0 ? (align) : (x)) & -(0 ? (x) : (align))))

// count trailing zeros.
// NOTE: the builtin's result is undefined for a 0 input
static inline unsigned ctz(unsigned v) {
Expand Down
96 changes: 58 additions & 38 deletions src/os/bluestore/BlueStore.cc
Expand Up @@ -1744,18 +1744,38 @@ int BlueStore::_open_fm(bool create)
dout(1) << __func__ << " pre-fragmenting freespace, using "
<< g_conf->bluestore_debug_prefill << " with max free extent "
<< g_conf->bluestore_debug_prefragment_max << dendl;
uint64_t start = ROUND_UP_TO(reserved, min_alloc_size);
uint64_t start = P2ROUNDUP(reserved, min_alloc_size);
uint64_t max_b = g_conf->bluestore_debug_prefragment_max / min_alloc_size;
float r = g_conf->bluestore_debug_prefill;
while (start < end) {
r /= 1.0 - r;
bool stop = false;

while (!stop && start < end) {
uint64_t l = (rand() % max_b + 1) * min_alloc_size;
if (start + l > end)
if (start + l > end) {
l = end - start;
l = ROUND_UP_TO(l, min_alloc_size);
uint64_t u = 1 + (uint64_t)(r * (double)l / (1.0 - r));
u = ROUND_UP_TO(u, min_alloc_size);
l = P2ALIGN(l, min_alloc_size);
}
assert(start + l <= end);

uint64_t u = 1 + (uint64_t)(r * (double)l);
u = P2ROUNDUP(u, min_alloc_size);
if (start + l + u > end) {
u = end - (start + l);
// trim to align so we don't overflow again
u = P2ALIGN(u, min_alloc_size);
stop = true;
}
assert(start + l + u <= end);

dout(20) << " free 0x" << std::hex << start << "~" << l
<< " use 0x" << u << std::dec << dendl;

if (u == 0) {
// break if u has been trimmed to nothing
break;
}

fm->allocate(start + l, u, t);
start += l + u;
}
Expand Down Expand Up @@ -2016,7 +2036,7 @@ int BlueStore::_open_db(bool create)
g_conf->bluestore_bluefs_gift_ratio);
initial = MAX(initial, g_conf->bluestore_bluefs_min);
// align to bluefs's alloc_size
initial = ROUND_UP_TO(initial, g_conf->bluefs_alloc_size);
initial = P2ROUNDUP(initial, g_conf->bluefs_alloc_size);
initial += g_conf->bluefs_alloc_size - BLUEFS_START;
bluefs->add_block_extent(bluefs_shared_bdev, BLUEFS_START, initial);
bluefs_extents.insert(BLUEFS_START, initial);
Expand Down Expand Up @@ -2294,7 +2314,7 @@ int BlueStore::_balance_bluefs_freespace(vector<bluestore_pextent_t> *extents,

if (gift) {
// round up to alloc size
gift = ROUND_UP_TO(gift, min_alloc_size);
gift = P2ROUNDUP(gift, min_alloc_size);

// hard cap to fit into 32 bits
gift = MIN(gift, 1ull<<31);
Expand Down Expand Up @@ -2325,7 +2345,7 @@ int BlueStore::_balance_bluefs_freespace(vector<bluestore_pextent_t> *extents,
// reclaim from bluefs?
if (reclaim) {
// round up to alloc size
reclaim = ROUND_UP_TO(reclaim, min_alloc_size);
reclaim = P2ROUNDUP(reclaim, min_alloc_size);

// hard cap to fit into 32 bits
reclaim = MIN(reclaim, 1ull<<31);
Expand Down Expand Up @@ -5684,6 +5704,7 @@ void BlueStore::_do_write_small(
dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
<< std::dec << dendl;
assert(length < min_alloc_size);
uint64_t end = offset + length;

bufferlist bl;
blp.copy(length, bl);
Expand All @@ -5703,7 +5724,7 @@ void BlueStore::_do_write_small(
break;
}
int64_t blob = ep->second.blob;
b = c->get_blob(o, ep->second.blob);
b = c->get_blob(o, blob);
if (!b->blob.is_mutable() || b->blob.is_compressed()) {
dout(20) << __func__ << " ignoring immutable " << blob << ": " << *b
<< dendl;
Expand All @@ -5722,15 +5743,16 @@ void BlueStore::_do_write_small(

// can we pad our head/tail out with zeros?
uint64_t chunk_size = b->blob.get_chunk_size(block_size);
uint64_t head_pad = offset % chunk_size;
uint64_t head_pad = P2PHASE(offset, chunk_size);
if (head_pad && o->onode.has_any_lextents(offset - head_pad, chunk_size)) {
head_pad = 0;
}
uint64_t tail_pad =
ROUND_UP_TO(offset + length, chunk_size) - (offset + length);
if (tail_pad && o->onode.has_any_lextents(offset + length, tail_pad)) {

uint64_t tail_pad = P2NPHASE(end, chunk_size);
if (tail_pad && o->onode.has_any_lextents(end, tail_pad)) {
tail_pad = 0;
}

bufferlist padded = bl;
if (head_pad) {
bufferlist z;
Expand Down Expand Up @@ -5784,9 +5806,8 @@ void BlueStore::_do_write_small(
}

// read some data to fill out the chunk?
uint64_t head_read = b_off % chunk_size;
uint64_t tail_read =
ROUND_UP_TO(b_off + b_len, chunk_size) - (b_off + b_len);
uint64_t head_read = P2PHASE(b_off, chunk_size);
uint64_t tail_read = P2NPHASE(b_off + b_len, chunk_size);
if ((head_read || tail_read) &&
(b->blob.get_ondisk_length() >= b_off + b_len + tail_read) &&
head_read + tail_read < min_alloc_size) {
Expand Down Expand Up @@ -5863,12 +5884,12 @@ void BlueStore::_do_write_small(
// new blob.
b = o->blob_map.new_blob(c->cache);
unsigned alloc_len = min_alloc_size;
uint64_t b_off = offset % alloc_len;
uint64_t b_off = P2PHASE(offset, alloc_len);
b->bc.write(txc->seq, b_off, bl, wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
_pad_zeros(&bl, &b_off, block_size);
o->onode.punch_hole(offset, length, &wctx->lex_old);
bluestore_lextent_t& lex = o->onode.extent_map[offset] =
bluestore_lextent_t(b->id, offset % min_alloc_size, length);
bluestore_lextent_t(b->id, P2PHASE(offset, alloc_len), length);
b->blob.ref_map.get(lex.offset, lex.length);
txc->statfs_delta.stored() += lex.length;
dout(20) << __func__ << " lex 0x" << std::hex << offset << std::dec
Expand Down Expand Up @@ -5958,10 +5979,10 @@ int BlueStore::_do_alloc_write(
::encode(chdr, compressed_bl);
compressed_bl.claim_append(t);
uint64_t rawlen = compressed_bl.length();
uint64_t newlen = ROUND_UP_TO(rawlen, min_alloc_size);
uint64_t newlen = P2ROUNDUP(rawlen, min_alloc_size);
uint64_t dstlen = final_length *
g_conf->bluestore_compression_required_ratio;
dstlen = ROUND_UP_TO(dstlen, min_alloc_size);
dstlen = P2ROUNDUP(dstlen, min_alloc_size);
if (newlen <= dstlen && newlen < final_length) {
// Cool. We compressed at least as much as we were hoping to.
// pad out to min_alloc_size
Expand Down Expand Up @@ -6161,28 +6182,27 @@ int BlueStore::_do_write(
// we fall within the same block
_do_write_small(txc, c, o, offset, length, p, &wctx);
} else {
uint64_t head_offset = 0, head_length = 0;
uint64_t middle_offset = 0, middle_length = 0;
uint64_t tail_offset = 0, tail_length = 0;
if (offset % min_alloc_size) {
head_offset = offset;
head_length = min_alloc_size - (offset % min_alloc_size);
assert(head_length < length);
uint64_t head_offset, head_length;
uint64_t middle_offset, middle_length;
uint64_t tail_offset, tail_length;

head_offset = offset;
head_length = P2NPHASE(offset, min_alloc_size);

tail_offset = P2ALIGN(end, min_alloc_size);
tail_length = P2PHASE(end, min_alloc_size);

middle_offset = head_offset + head_length;
middle_length = length - head_length - tail_length;

if (head_length) {
_do_write_small(txc, c, o, head_offset, head_length, p, &wctx);
middle_offset = offset + head_length;
middle_length = length - head_length;
} else {
middle_offset = offset;
middle_length = length;
}
if (end % min_alloc_size) {
tail_length = end % min_alloc_size;
tail_offset = end - tail_length;
middle_length -= tail_length;
}

if (middle_length) {
_do_write_big(txc, c, o, middle_offset, middle_length, p, &wctx);
}

if (tail_length) {
_do_write_small(txc, c, o, tail_offset, tail_length, p, &wctx);
}
Expand Down
19 changes: 19 additions & 0 deletions src/test/test_intarith.cc
Expand Up @@ -61,3 +61,22 @@ TEST(intarith, ctz) {
ASSERT_EQ(20u, ctzll(0xffffffff00000));
ASSERT_EQ(48u, ctzll(0xff000000000000ull));
}

// Checks the power-of-2 helper macros (ISP2, P2ALIGN, P2PHASE, P2NPHASE,
// P2ROUNDUP) against hand-computed values, using both unaligned and
// already-aligned inputs.
TEST(intarith, p2family) {
// ISP2: a single-bit value is accepted, a multi-bit value is rejected
ASSERT_TRUE(ISP2(0x100));
ASSERT_FALSE(ISP2(0x1234));

// P2ALIGN: round down; an aligned input is returned unchanged
ASSERT_EQ(1024, P2ALIGN(1200, 1024));
ASSERT_EQ(1024, P2ALIGN(1024, 1024));
ASSERT_EQ(0x1200, P2ALIGN(0x1234, 0x100));
ASSERT_EQ(0x5600, P2ALIGN(0x5600, 0x100));

// P2PHASE: offset within the block; 0 for an aligned input
ASSERT_EQ(0x34, P2PHASE(0x1234, 0x100));
ASSERT_EQ(0x00, P2PHASE(0x5600, 0x100));

// P2NPHASE: distance up to the next boundary; 0 for an aligned input
ASSERT_EQ(0xcc, P2NPHASE(0x1234, 0x100));
ASSERT_EQ(0x00, P2NPHASE(0x5600, 0x100));

// P2ROUNDUP: round up; an aligned input is returned unchanged
ASSERT_EQ(0x1300, P2ROUNDUP(0x1234, 0x100));
ASSERT_EQ(0x5600, P2ROUNDUP(0x5600, 0x100));
}

0 comments on commit 956c769

Please sign in to comment.