Skip to content

Commit

Permalink
vinyl: add bloom_fpr optimal calculation
Browse files Browse the repository at this point in the history
The optimal bloom_fpr is calculated
using transformed optimization from Monkey:
https://stratos.seas.harvard.edu/files/stratos/files/monkeykeyvaluestore.pdf
article: the calculation based not on a run's level
in LSM tree, but on it's size.

See also tarantool#3969
NO_DOC=internal
NO_TEST=internal
  • Loading branch information
EmirVildanov committed Sep 19, 2022
1 parent 3733ff2 commit 6db4f25
Show file tree
Hide file tree
Showing 14 changed files with 201 additions and 6 deletions.
3 changes: 3 additions & 0 deletions changelogs/unreleased/gh-3969-set-bloom-fpr-automatically.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
## feature/vinyl

* Made bloom filters calculation optimized by run's level in LSM-tree (gh-3969).
6 changes: 6 additions & 0 deletions src/box/alter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,12 @@ index_opts_decode(struct index_opts *opts, const char *map,
"less than or equal to 1");
return -1;
}
if (opts->lookup_cost_coeff <= 0 || opts->lookup_cost_coeff > 1) {
diag_set(ClientError, ER_WRONG_INDEX_OPTIONS,
"lookup_cost_coeff must be greater than 0 and "
"less than or equal to 1");
return -1;
}
return 0;
}

Expand Down
5 changes: 5 additions & 0 deletions src/box/box.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1241,6 +1241,7 @@ box_check_vinyl_options(void)
int run_count_per_level = cfg_geti("vinyl_run_count_per_level");
double run_size_ratio = cfg_getd("vinyl_run_size_ratio");
double bloom_fpr = cfg_getd("vinyl_bloom_fpr");
double lookup_cost_coeff = cfg_getd("vinyl_lookup_cost_coeff");

if (box_check_memory_quota("vinyl_memory") < 0)
diag_raise();
Expand Down Expand Up @@ -1270,6 +1271,10 @@ box_check_vinyl_options(void)
tnt_raise(ClientError, ER_CFG, "vinyl_bloom_fpr",
"must be greater than 0 and less than or equal to 1");
}
if (lookup_cost_coeff <= 0 || lookup_cost_coeff > 1) {
tnt_raise(ClientError, ER_CFG, "vinyl_lookup_cost_coeff",
"must be greater than 0 and less than or equal to 1");
}
}

static int
Expand Down
2 changes: 2 additions & 0 deletions src/box/index_def.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ const struct index_opts index_opts_default = {
/* .run_count_per_level = */ 2,
/* .run_size_ratio = */ 3.5,
/* .bloom_fpr = */ 0.05,
/* .lookup_cost_coeff = */ 0.5,
/* .lsn = */ 0,
/* .stat = */ NULL,
/* .func = */ 0,
Expand All @@ -64,6 +65,7 @@ const struct opt_def index_opts_reg[] = {
OPT_DEF("run_count_per_level", OPT_INT64, struct index_opts, run_count_per_level),
OPT_DEF("run_size_ratio", OPT_FLOAT, struct index_opts, run_size_ratio),
OPT_DEF("bloom_fpr", OPT_FLOAT, struct index_opts, bloom_fpr),
OPT_DEF("lookup_cost_coeff", OPT_FLOAT, struct index_opts, lookup_cost_coeff),
OPT_DEF("lsn", OPT_INT64, struct index_opts, lsn),
OPT_DEF("func", OPT_UINT32, struct index_opts, func_id),
OPT_DEF_LEGACY("sql"),
Expand Down
12 changes: 11 additions & 1 deletion src/box/index_def.h
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,16 @@ struct index_opts {
* previous one.
*/
double run_size_ratio;
/* Bloom filter false positive rate. */
/* Bloom filter false positive rate.
* Used only as default parameter when rebuilding index
* See vy_run_rebuild_index()
*/
double bloom_fpr;
/* Coefficient of worst-case read scenario:
* the lower parameter, the higher read performance
* with more memory consumption.
*/
double lookup_cost_coeff;
/**
* LSN from the time of index creation.
*/
Expand Down Expand Up @@ -213,6 +221,8 @@ index_opts_cmp(const struct index_opts *o1, const struct index_opts *o2)
return o1->run_size_ratio < o2->run_size_ratio ? -1 : 1;
if (o1->bloom_fpr != o2->bloom_fpr)
return o1->bloom_fpr < o2->bloom_fpr ? -1 : 1;
if (o1->lookup_cost_coeff != o2->lookup_cost_coeff)
return o1->lookup_cost_coeff < o2->lookup_cost_coeff ? -1 : 1;
if (o1->func_id != o2->func_id)
return o1->func_id - o2->func_id;
if (o1->hint != o2->hint)
Expand Down
2 changes: 2 additions & 0 deletions src/box/lua/load_cfg.lua
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ local default_cfg = {
vinyl_range_size = nil, -- set automatically
vinyl_page_size = 8 * 1024,
vinyl_bloom_fpr = 0.05,
vinyl_lookup_cost_coeff = 0.5,

-- logging options are covered by
-- a separate log module; they are
Expand Down Expand Up @@ -190,6 +191,7 @@ local template_cfg = {
vinyl_range_size = 'number',
vinyl_page_size = 'number',
vinyl_bloom_fpr = 'number',
vinyl_lookup_cost_coeff = 'number',

log = 'module',
log_nonblock = 'module',
Expand Down
5 changes: 4 additions & 1 deletion src/box/lua/schema.lua
Original file line number Diff line number Diff line change
Expand Up @@ -1456,6 +1456,7 @@ local index_options = {
range_size = 'number',
page_size = 'number',
bloom_fpr = 'number',
lookup_cost_coeff = 'number',
func = 'number, string',
hint = 'boolean',
}
Expand Down Expand Up @@ -1551,7 +1552,8 @@ box.schema.index.create = function(space_id, name, options)
range_size = box.cfg.vinyl_range_size,
run_count_per_level = box.cfg.vinyl_run_count_per_level,
run_size_ratio = box.cfg.vinyl_run_size_ratio,
bloom_fpr = box.cfg.vinyl_bloom_fpr
bloom_fpr = box.cfg.vinyl_bloom_fpr,
lookup_cost_coeff = box.cfg.vinyl_lookup_cost_coeff
}
else
options_defaults = {}
Expand Down Expand Up @@ -1604,6 +1606,7 @@ box.schema.index.create = function(space_id, name, options)
run_count_per_level = options.run_count_per_level,
run_size_ratio = options.run_size_ratio,
bloom_fpr = options.bloom_fpr,
lookup_cost_coeff = options.lookup_cost_coeff,
func = options.func,
hint = options.hint,
}
Expand Down
3 changes: 3 additions & 0 deletions src/box/lua/space.cc
Original file line number Diff line number Diff line change
Expand Up @@ -562,6 +562,9 @@ lbox_fillspace(struct lua_State *L, struct space *space, int i)
lua_pushnumber(L, index_opts->bloom_fpr);
lua_setfield(L, -2, "bloom_fpr");

lua_pushnumber(L, index_opts->lookup_cost_coeff);
lua_setfield(L, -2, "lookup_cost_coeff");

lua_settable(L, -3);
}
lua_setfield(L, -2, index_def->name);
Expand Down
5 changes: 4 additions & 1 deletion src/box/sql.c
Original file line number Diff line number Diff line change
Expand Up @@ -1173,7 +1173,7 @@ sql_encode_index_opts(struct region *region, const struct index_opts *opts,
* page_size etc.
*/
uint8_t current_engine = current_session()->sql_default_engine;
uint32_t map_sz = current_engine == SQL_STORAGE_ENGINE_VINYL ? 6 : 1;
uint32_t map_sz = current_engine == SQL_STORAGE_ENGINE_VINYL ? 7 : 1;
mpstream_encode_map(&stream, map_sz);
mpstream_encode_str(&stream, "unique");
mpstream_encode_bool(&stream, opts->is_unique);
Expand All @@ -1188,6 +1188,9 @@ sql_encode_index_opts(struct region *region, const struct index_opts *opts,
mpstream_encode_double(&stream, cfg_getd("vinyl_run_size_ratio"));
mpstream_encode_str(&stream, "bloom_fpr");
mpstream_encode_double(&stream, cfg_getd("vinyl_bloom_fpr"));
mpstream_encode_str(&stream, "lookup_cost_coeff");
mpstream_encode_double(&stream,
cfg_getd("vinyl_lookup_cost_coeff"));
}
mpstream_flush(&stream);
if (is_error) {
Expand Down
1 change: 1 addition & 0 deletions src/box/vy_range.c
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,7 @@ vy_range_update_compaction_priority(struct vy_range *range,
slice = rlist_last_entry(&range->slices, struct vy_slice, in_range);
size = MAX(slice->count.bytes, 1);
slice = rlist_first_entry(&range->slices, struct vy_slice, in_range);
/** Should we change it to while cycle? */
do {
target_run_size = size;
size = DIV_ROUND_UP(target_run_size, opts->run_size_ratio);
Expand Down
158 changes: 155 additions & 3 deletions src/box/vy_scheduler.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
*/
#include "vy_scheduler.h"

#include <math.h>
#include <assert.h>
#include <stdarg.h>
#include <stdbool.h>
Expand Down Expand Up @@ -1116,6 +1117,140 @@ vy_task_write_run(struct vy_task *task, bool no_compression)
return -1;
}

/**
* Table of symbols:
* R - Worst-case zero-result point lookup cost (in runs);
* W_u - number of unfiltered runs;
* W_f - number of filtered runs;
* t_r - Adopted run_size_ratio_coeff.
*
* Note:
* We should consider the fact that
* current range doesn't yet contains newly created run.
*
* @param lsm
* @param range
* @param new_run_stmts_count statements count in the newly created after
* dump/compaction run
* @param W total number of runs in the range
* after operation (dump/compaction)
* @param total_stmts_count total number of statements in the range
* after operation (dump/compaction)
* @param skip_n number of runs to skip during the range traverse
* as they are compacted into one new run.
* @return optimally calculated bloom_fpr
*/
double
vy_task_find_range_bloom_fpr(
struct vy_lsm *lsm,
struct vy_range *range,
uint64_t new_run_stmts_count,
int W,
uint64_t total_stmts_count,
int skip_n)
{
double R = W * lsm->opts.lookup_cost_coeff;

if (new_run_stmts_count == total_stmts_count) {
/** By definition of R. May also look at formulas ahead.
* R <= 1 in this case as soon as W = 1.
* It also means that skip_n == range->slice_count.
*/
return R;
}

struct vy_slice *last_slice =
rlist_last_entry(&range->slices, struct vy_slice, in_range);
uint64_t last_slice_stmts_count = last_slice->count.rows;

if (new_run_stmts_count > last_slice_stmts_count) {
/** The only case is when the size of dumped run
* is bigger than the size of the last run.
* Returning default bloom_fpr value,
* cause otherwise formula breaks.
* update_compaction_priority will understand that
* major compaction must be called and compaciton will create
* new run with optimal bloom_fpr
*/
return lsm->opts.bloom_fpr;
}

uint64_t scnd_last_slice_stmts_count = 0;

if (skip_n == range->slice_count - 1) {
/** In both dump and compaction cases it means
* we only have the new run except the last run.
*/
scnd_last_slice_stmts_count = new_run_stmts_count;
} else {
struct vy_slice *scnd_last_slice = rlist_prev_entry(
last_slice, in_range);
scnd_last_slice_stmts_count = scnd_last_slice->count.rows;
}

int W_u = 0;
int W_f = 0;
double t_r = 0;

if (R <= (total_stmts_count / last_slice_stmts_count)) {
W_u = 0;
} else {
t_r = (double)scnd_last_slice_stmts_count /
last_slice_stmts_count;
W_u = MAX(0, floor((R - t_r * W) / (1 - t_r)));
}
if (W_u == W) {
/** As soon as all runs should be unfiltered. */
return 1;
}
W_f = W - W_u;

int slice_number = 0;
uint64_t filtered_stmts_count_sum = new_run_stmts_count;
struct vy_slice *slice;
rlist_foreach_entry(slice, &range->slices, in_range) {
if (slice_number++ < skip_n) {
continue;
}

filtered_stmts_count_sum += slice->count.rows;

if (slice_number == W_f) {
break;
}
}

double bloom_fpr = (R - W_u) * new_run_stmts_count /
filtered_stmts_count_sum;
return bloom_fpr;
}

/*
* Logic:
* Let's consider our dump is divided evenly into all lsm ranges.
* So everything we have to do is to find optimized bloom_fpr
* for small dump part in one of lsm ranges.
*
* @param lsm
* @param new_run_stmts_count statements count in the newly created after
* dump run
* @return optimally calculated bloom_fpr for dump
*/
double
vy_task_find_lsm_bloom_fpr(
struct vy_lsm *lsm,
uint64_t new_run_stmts_count)
{
struct vy_range *range = vy_range_tree_first(&lsm->range_tree);
uint64_t part_run_stmts_count = new_run_stmts_count / lsm->range_count;

int W = range->slice_count + 1;
int total_stmts_count = range->count.rows + part_run_stmts_count;
return vy_task_find_range_bloom_fpr(
lsm, range, part_run_stmts_count,
W, total_stmts_count, 0);
}

static int
vy_task_dump_execute(struct vy_task *task)
{
Expand Down Expand Up @@ -1348,7 +1483,9 @@ vy_task_dump_new(struct vy_scheduler *scheduler, struct vy_worker *worker,
*/
int64_t dump_lsn = -1;
struct vy_mem *mem, *next_mem;
uint64_t dump_stmts_count = 0;
rlist_foreach_entry_safe(mem, &lsm->sealed, in_sealed, next_mem) {
dump_stmts_count += mem->count.rows;
if (mem->generation > scheduler->dump_generation)
continue;
vy_mem_wait_pinned(mem);
Expand Down Expand Up @@ -1388,6 +1525,9 @@ vy_task_dump_new(struct vy_scheduler *scheduler, struct vy_worker *worker,
* dump so we don't pass a deferred DELETE handler.
*/
struct vy_stmt_stream *wi;
/* In vy_task_compaction_new it is
* (range->compaction_priority == range->slice_count.
*/
bool is_last_level = (lsm->run_count == 0);
wi = vy_write_iterator_new(task->cmp_def, lsm->index_id == 0,
is_last_level, scheduler->read_views, NULL);
Expand All @@ -1402,7 +1542,9 @@ vy_task_dump_new(struct vy_scheduler *scheduler, struct vy_worker *worker,

task->new_run = new_run;
task->wi = wi;
task->bloom_fpr = lsm->opts.bloom_fpr;
task->bloom_fpr = vy_task_find_lsm_bloom_fpr(
lsm,
dump_stmts_count);
task->page_size = lsm->opts.page_size;

lsm->is_dumping = true;
Expand Down Expand Up @@ -1675,6 +1817,7 @@ vy_task_compaction_new(struct vy_scheduler *scheduler, struct vy_worker *worker,
struct vy_slice *slice;
int32_t dump_count = 0;
int n = range->compaction_priority;
uint64_t compacted_run_stmts_count = 0;
rlist_foreach_entry(slice, &range->slices, in_range) {
if (vy_write_iterator_new_slice(wi, slice,
lsm->disk_format) != 0)
Expand All @@ -1686,12 +1829,17 @@ vy_task_compaction_new(struct vy_scheduler *scheduler, struct vy_worker *worker,
if (task->first_slice == NULL)
task->first_slice = slice;
task->last_slice = slice;

compacted_run_stmts_count += slice->count.rows;
if (--n == 0)
break;
}
assert(n == 0);
assert(new_run->dump_lsn >= 0);

/*
* If we are working with is_last_level.
* See dump_count exploration in struct vy_run.
*/
if (range->compaction_priority == range->slice_count)
dump_count -= slice->run->dump_count;
/*
Expand All @@ -1709,7 +1857,11 @@ vy_task_compaction_new(struct vy_scheduler *scheduler, struct vy_worker *worker,
task->range = range;
task->new_run = new_run;
task->wi = wi;
task->bloom_fpr = lsm->opts.bloom_fpr;
task->bloom_fpr = vy_task_find_range_bloom_fpr(
lsm, range, compacted_run_stmts_count,
range->slice_count - range->compaction_priority + 1,
range->count.rows,
range->compaction_priority);
task->page_size = lsm->opts.page_size;

/*
Expand Down
2 changes: 2 additions & 0 deletions test/box-tap/cfg.test.lua
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ invalid('vinyl_run_count_per_level', 0)
invalid('vinyl_run_size_ratio', 1)
invalid('vinyl_bloom_fpr', 0)
invalid('vinyl_bloom_fpr', 1.1)
invalid('lookup_cost_coeff', 0)
invalid('lookup_cost_coeff', 1.1)
invalid('wal_queue_max_size', -1)

local function invalid_combinations(name, val)
Expand Down
1 change: 1 addition & 0 deletions test/sql/vinyl-opts-cfg.lua
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
--
box.cfg {
vinyl_bloom_fpr = 0.1,
vinyl_lookup_cost_coeff = 0.5,
vinyl_page_size = 32 * 1024,
vinyl_range_size = 512 * 1024 * 1024,
vinyl_run_size_ratio = 5,
Expand Down

0 comments on commit 6db4f25

Please sign in to comment.