Skip to content

Commit

Permalink
Merge pull request #9261 from CurtizJ/improve-index-in
Browse files Browse the repository at this point in the history
Improve performance of analysing index with IN
  • Loading branch information
alexey-milovidov committed Mar 4, 2020
2 parents 4629f78 + e21971c commit 1d51ad0
Show file tree
Hide file tree
Showing 6 changed files with 246 additions and 35 deletions.
94 changes: 62 additions & 32 deletions dbms/src/Interpreters/Set.cpp
Expand Up @@ -458,9 +458,19 @@ MergeTreeSetIndex::MergeTreeSetIndex(const Columns & set_elements, std::vector<K

size_t tuple_size = indexes_mapping.size();
ordered_set.resize(tuple_size);

/// Create columns for points here to avoid extra allocations at 'checkInRange'.
left_point.reserve(tuple_size);
right_point.reserve(tuple_size);

for (size_t i = 0; i < tuple_size; ++i)
{
ordered_set[i] = set_elements[indexes_mapping[i].tuple_index];

left_point.emplace_back(ordered_set[i]->cloneEmpty());
right_point.emplace_back(ordered_set[i]->cloneEmpty());
}

Block block_to_sort;
SortDescription sort_description;
for (size_t i = 0; i < tuple_size; ++i)
Expand All @@ -484,13 +494,6 @@ BoolMask MergeTreeSetIndex::checkInRange(const std::vector<Range> & key_ranges,
{
size_t tuple_size = indexes_mapping.size();

using FieldWithInfinityTuple = std::vector<FieldWithInfinity>;

FieldWithInfinityTuple left_point;
FieldWithInfinityTuple right_point;
left_point.reserve(tuple_size);
right_point.reserve(tuple_size);

bool invert_left_infinities = false;
bool invert_right_infinities = false;

Expand All @@ -512,66 +515,93 @@ BoolMask MergeTreeSetIndex::checkInRange(const std::vector<Range> & key_ranges,
if (!new_range->left_included)
invert_left_infinities = true;

left_point.push_back(FieldWithInfinity(new_range->left));
left_point[i].update(new_range->left);
}
else
{
if (invert_left_infinities)
left_point.push_back(FieldWithInfinity::getPlusinfinity());
left_point[i].update(ValueWithInfinity::PLUS_INFINITY);
else
left_point.push_back(FieldWithInfinity::getMinusInfinity());
left_point[i].update(ValueWithInfinity::MINUS_INFINITY);
}

if (new_range->right_bounded)
{
if (!new_range->right_included)
invert_right_infinities = true;

right_point.push_back(FieldWithInfinity(new_range->right));
right_point[i].update(new_range->right);
}
else
{
if (invert_right_infinities)
right_point.push_back(FieldWithInfinity::getMinusInfinity());
right_point[i].update(ValueWithInfinity::MINUS_INFINITY);
else
right_point.push_back(FieldWithInfinity::getPlusinfinity());
right_point[i].update(ValueWithInfinity::PLUS_INFINITY);
}
}

/// This allows to construct tuple in 'ordered_set' at specified index for comparison with range.
auto compare = [](const IColumn & lhs, const ValueWithInfinity & rhs, size_t row)
{
auto type = rhs.getType();
/// Return inverted infinity sign, because in 'lhs' all values are finite.
if (type != ValueWithInfinity::NORMAL)
return -static_cast<int>(type);

auto indices = ext::range(0, ordered_set.at(0)->size());
return lhs.compareAt(row, 0, rhs.getColumnIfFinite(), 1);
};

auto extract_tuple = [tuple_size, this](size_t i)
auto less = [this, &compare, tuple_size](size_t row, const auto & point)
{
/// Inefficient.
FieldWithInfinityTuple res;
res.reserve(tuple_size);
for (size_t j = 0; j < tuple_size; ++j)
res.emplace_back((*ordered_set[j])[i]);
return res;
for (size_t i = 0; i < tuple_size; ++i)
{
int res = compare(*ordered_set[i], point[i], row);
if (res)
return res < 0;
}
return false;
};

auto compare = [&extract_tuple](size_t i, const FieldWithInfinityTuple & rhs)
auto equals = [this, &compare, tuple_size](size_t row, const auto & point)
{
return extract_tuple(i) < rhs;
for (size_t i = 0; i < tuple_size; ++i)
if (compare(*ordered_set[i], point[i], row) != 0)
return false;
return true;
};

/** Because each parallelogram maps to a contiguous sequence of elements
* laid out in the lexicographically increasing order, the set intersects the range
* if and only if either bound coincides with an element or at least one element
* is between the lower bounds
*/
auto left_lower = std::lower_bound(indices.begin(), indices.end(), left_point, compare);
auto right_lower = std::lower_bound(indices.begin(), indices.end(), right_point, compare);
* laid out in the lexicographically increasing order, the set intersects the range
* if and only if either bound coincides with an element or at least one element
* is between the lower bounds
*/
auto indices = ext::range(0, size());
auto left_lower = std::lower_bound(indices.begin(), indices.end(), left_point, less);
auto right_lower = std::lower_bound(indices.begin(), indices.end(), right_point, less);

return
{
left_lower != right_lower
|| (left_lower != indices.end() && extract_tuple(*left_lower) == left_point)
|| (right_lower != indices.end() && extract_tuple(*right_lower) == right_point),
|| (left_lower != indices.end() && equals(*left_lower, left_point))
|| (right_lower != indices.end() && equals(*right_lower, right_point)),
true
};
}

/// Stores the finite value 'x', replacing the value stored previously (if any).
void ValueWithInfinity::update(const Field & x)
{
    /// The column is used as a single-slot buffer: evict the old element before inserting the new one.
    const bool holds_value = !column->empty();
    if (holds_value)
        column->popBack(1);

    column->insert(x);

    /// A concrete value is stored now, so the object is no longer marked as an infinity.
    type = NORMAL;
}

/// Gives access to the column that stores the value.
/// Throws LOGICAL_ERROR when the object is currently marked as an infinity.
const IColumn & ValueWithInfinity::getColumnIfFinite() const
{
    if (type == NORMAL)
        return *column;

    throw Exception("Trying to get column of infinite type", ErrorCodes::LOGICAL_ERROR);
}

}
36 changes: 35 additions & 1 deletion dbms/src/Interpreters/Set.h
Expand Up @@ -16,7 +16,6 @@ namespace DB
{

struct Range;
class FieldWithInfinity;

class IFunctionBase;
using FunctionBasePtr = std::shared_ptr<IFunctionBase>;
Expand Down Expand Up @@ -180,6 +179,36 @@ using Sets = std::vector<SetPtr>;
class IFunction;
using FunctionPtr = std::shared_ptr<IFunction>;

/** Class that represents a single value with possible infinities.
  * The single field is stored in a column to allow in-place comparisons with other regular columns:
  * extracting fields from columns and comparing them afterwards is suboptimal and requires extra copying.
  */
class ValueWithInfinity
{
public:
    /// Kind of the value. The numeric value of an infinity is its sign; NORMAL means a finite value.
    enum Type
    {
        MINUS_INFINITY = -1,
        NORMAL = 0,
        PLUS_INFINITY = 1
    };

    /// Takes ownership of 'column_', which is used as the storage for the finite value.
    /// Explicit, so that a column is never silently converted into a ValueWithInfinity.
    explicit ValueWithInfinity(MutableColumnPtr && column_)
        : column(std::move(column_)), type(NORMAL) {}

    /// Store the finite value 'x' and mark the object as NORMAL.
    void update(const Field & x);

    /// Change only the kind of the value; the contents of the column are left untouched.
    void update(Type type_) { type = type_; }

    /// Returns the column that holds the finite value. Throws if the value is not NORMAL.
    const IColumn & getColumnIfFinite() const;

    Type getType() const { return type; }

private:
    MutableColumnPtr column;
    Type type;
};


/// Class for checkInRange function.
class MergeTreeSetIndex
{
Expand All @@ -203,6 +232,11 @@ class MergeTreeSetIndex
private:
Columns ordered_set;
std::vector<KeyTuplePositionMapping> indexes_mapping;

using ColumnsWithInfinity = std::vector<ValueWithInfinity>;

ColumnsWithInfinity left_point;
ColumnsWithInfinity right_point;
};

}
114 changes: 114 additions & 0 deletions dbms/src/Interpreters/tests/gtest_merge_tree_set_index.cpp
@@ -0,0 +1,114 @@
#include <Interpreters/Set.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeString.h>
#include <Storages/MergeTree/KeyCondition.h>

#include <gtest/gtest.h>

using namespace DB;

// Checks a single-column Int64 set {1, 5, 7} against bounded and half-bounded key ranges.
TEST(MergeTreeSetIndex, checkInRange_one)
{
    DataTypes types = {std::make_shared<const DataTypeInt64>()};

    auto set_column = types[0]->createColumn();
    for (auto element : {1, 5, 7})
        set_column->insert(element);

    Columns columns = {std::move(set_column)};

    std::vector<MergeTreeSetIndex::KeyTuplePositionMapping> mapping = {{0, 0, {}}};
    auto set = std::make_unique<MergeTreeSetIndex>(columns, std::move(mapping));

    // Runs the index check for a one-column key constrained by 'range'.
    auto can_be_true = [&](const Range & range)
    {
        std::vector<Range> ranges = {range};
        return set->checkInRange(ranges, types).can_be_true;
    };

    // Left and right bounded
    ASSERT_EQ(can_be_true(Range(1, true, 4, true)), true) << "(1, 4)";
    ASSERT_EQ(can_be_true(Range(2, true, 4, true)), false) << "(2, 4)";
    ASSERT_EQ(can_be_true(Range(-1, true, 0, true)), false) << "(-1, 0)";
    ASSERT_EQ(can_be_true(Range(-1, true, 10, true)), true) << "(-1, 10)";

    // Left bounded
    ASSERT_EQ(can_be_true(Range::createLeftBounded(1, true)), true) << "(1, +inf)";
    ASSERT_EQ(can_be_true(Range::createLeftBounded(-1, true)), true) << "(-1, +inf)";
    ASSERT_EQ(can_be_true(Range::createLeftBounded(10, true)), false) << "(10, +inf)";

    // Right bounded
    ASSERT_EQ(can_be_true(Range::createRightBounded(1, true)), true) << "(-inf, 1)";
    ASSERT_EQ(can_be_true(Range::createRightBounded(-1, true)), false) << "(-inf, -1)";
    ASSERT_EQ(can_be_true(Range::createRightBounded(10, true)), true) << "(-inf, 10)";
}

// Checks a two-column (UInt64, String) set against one key range per column.
// Set elements: (1,'a'), (1,'b'), (3,'a'), (3,'a'), (3,'b'), (10,'c').
// Every failure label spells out exactly the ranges that were passed to checkInRange.
TEST(MergeTreeSetIndex, checkInRange_tuple)
{
    DataTypes types = {std::make_shared<const DataTypeUInt64>(), std::make_shared<const DataTypeString>()};

    Columns columns;
    {
        // First tuple component.
        auto values = {1, 1, 3, 3, 3, 10};
        auto mut = types[0]->createColumn();
        for (auto & val : values)
            mut->insert(val);
        columns.push_back(std::move(mut));
    }

    {
        // Second tuple component.
        auto values = {"a", "b", "a", "a", "b", "c"};
        auto mut = types[1]->createColumn();
        for (auto & val : values)
            mut->insert(val);
        columns.push_back(std::move(mut));
    }

    std::vector<MergeTreeSetIndex::KeyTuplePositionMapping> mapping = {{0, 0, {}}, {1, 1, {}}};
    auto set = std::make_unique<MergeTreeSetIndex>(columns, std::move(mapping));

    std::vector<Range> ranges = {Range(1), Range("a", true, "c", true)};
    ASSERT_EQ(set->checkInRange(ranges, types).can_be_true, true) << "Range(1), Range('a', true, 'c', true)";

    ranges = {Range(1, false, 3, false), Range()};
    ASSERT_EQ(set->checkInRange(ranges, types).can_be_true, false) << "Range(1, false, 3, false), Range()";

    ranges = {Range(2, false, 5, false), Range()};
    ASSERT_EQ(set->checkInRange(ranges, types).can_be_true, true) << "Range(2, false, 5, false), Range()";

    ranges = {Range(3), Range::createLeftBounded("a", true)};
    ASSERT_EQ(set->checkInRange(ranges, types).can_be_true, true) << "Range(3), Range::createLeftBounded('a', true)";

    ranges = {Range(3), Range::createLeftBounded("f", true)};
    ASSERT_EQ(set->checkInRange(ranges, types).can_be_true, false) << "Range(3), Range::createLeftBounded('f', true)";

    ranges = {Range(3), Range::createRightBounded("a", true)};
    ASSERT_EQ(set->checkInRange(ranges, types).can_be_true, true) << "Range(3), Range::createRightBounded('a', true)";

    ranges = {Range(3), Range::createRightBounded("b", true)};
    ASSERT_EQ(set->checkInRange(ranges, types).can_be_true, true) << "Range(3), Range::createRightBounded('b', true)";

    ranges = {Range(1), Range("b")};
    ASSERT_EQ(set->checkInRange(ranges, types).can_be_true, true) << "Range(1), Range('b')";

    ranges = {Range(1), Range("c")};
    ASSERT_EQ(set->checkInRange(ranges, types).can_be_true, false) << "Range(1), Range('c')";

    // The label used to mention Range('x', true, 'z', true), which is not what is tested here.
    ranges = {Range(2, true, 3, true), Range()};
    ASSERT_EQ(set->checkInRange(ranges, types).can_be_true, true) << "Range(2, true, 3, true), Range()";

    // The label used to mention Range(2, true, 3, true), Range('c', true, 'z', true), which is not what is tested here.
    ranges = {Range(2), Range("a", true, "z", true)};
    ASSERT_EQ(set->checkInRange(ranges, types).can_be_true, false) << "Range(2), Range('a', true, 'z', true)";
}
2 changes: 1 addition & 1 deletion dbms/src/Storages/MergeTree/KeyCondition.cpp
Expand Up @@ -362,7 +362,7 @@ FieldWithInfinity FieldWithInfinity::getMinusInfinity()
return FieldWithInfinity(Type::MINUS_INFINITY);
}

FieldWithInfinity FieldWithInfinity::getPlusinfinity()
/// Factory: builds a FieldWithInfinity whose type is PLUS_INFINITY.
FieldWithInfinity FieldWithInfinity::getPlusInfinity()
{
return FieldWithInfinity(Type::PLUS_INFINITY);
}
Expand Down
14 changes: 13 additions & 1 deletion dbms/src/Storages/MergeTree/KeyCondition.h
Expand Up @@ -15,6 +15,11 @@
namespace DB
{

/// Declared in this header because inline code below (FieldWithInfinity::getFieldIfFinite) throws with this code.
namespace ErrorCodes
{
extern const int BAD_TYPE_OF_FIELD;
}

class IFunction;
using FunctionBasePtr = std::shared_ptr<IFunctionBase>;

Expand Down Expand Up @@ -206,11 +211,18 @@ class FieldWithInfinity
FieldWithInfinity(Field && field_);

static FieldWithInfinity getMinusInfinity();
static FieldWithInfinity getPlusinfinity();
static FieldWithInfinity getPlusInfinity();

bool operator<(const FieldWithInfinity & other) const;
bool operator==(const FieldWithInfinity & other) const;

/// Returns a copy of the stored field.
/// Throws BAD_TYPE_OF_FIELD when the type is not NORMAL, i.e. the value is an infinity.
Field getFieldIfFinite() const
{
    if (type == NORMAL)
        return field;

    throw Exception("Trying to get field of infinite type", ErrorCodes::BAD_TYPE_OF_FIELD);
}

private:
Field field;
Type type;
Expand Down
21 changes: 21 additions & 0 deletions dbms/tests/performance/set_index.xml
@@ -0,0 +1,21 @@
<test>
<!-- Performance test for a query that filters a MergeTree table by `a IN (subquery)`. -->
<type>loop</type>

<!-- The table is ordered by `a`, so the IN condition in the query below applies to the ORDER BY key. -->
<create_query>CREATE TABLE test_in (`a` UInt32) ENGINE = MergeTree() ORDER BY a</create_query>
<fill_query>INSERT INTO test_in SELECT number FROM numbers(500000000)</fill_query>

<!-- Stop conditions for the loop, expressed as time limits in milliseconds. -->
<stop_conditions>
<all_of>
<total_time_ms>8000</total_time_ms>
</all_of>
<any_of>
<min_time_not_changing_for_ms>7000</min_time_not_changing_for_ms>
<total_time_ms>20000</total_time_ms>
</any_of>
</stop_conditions>


<!-- The set is built from 100000 values produced by the subquery.
     max_rows_to_read = 1 together with read_overflow_mode = 'break' limits how much data is read;
     NOTE(review): presumably this makes the measured time reflect index analysis rather than reading (confirm). -->
<query>SELECT count() FROM test_in WHERE a IN (SELECT rand(1) FROM numbers(100000)) SETTINGS max_rows_to_read = 1, read_overflow_mode = 'break'</query>

<drop_query>DROP TABLE IF EXISTS test_in</drop_query>
</test>

0 comments on commit 1d51ad0

Please sign in to comment.