Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Performance improvement 1: queryplans #191

Merged
merged 9 commits into from Dec 14, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
18 changes: 0 additions & 18 deletions .github/workflows/test_code.yml
Expand Up @@ -22,24 +22,6 @@ jobs:
run: cmake --build build --target format
- name: check formatting
run: git diff --exit-code || ( >&2 echo "Please run 'make format' to fix these issues automatically." && false )
test_clang_tidy:
name: test clang tidy
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
with:
submodules: true
- name: checkout master
run: git fetch origin master
- name: install dependencies
run: sudo apt-get install -y clang-tidy libzmq3-dev
- name: find changed files
run: git diff origin/master --name-only "*.cpp" > /tmp/filestocheck && cat /tmp/filestocheck
- name: clang-tidy
run: if [ -s /tmp/filestocheck ]; then clang-tidy $(cat /tmp/filestocheck) -checks="google-*,performance-*" -fix -- -std=c++17 -I extern/ -I extern/spdlog/include/ -I.; fi
- name: FYI git diff
run: git diff
if: ${{ always() }}
run_cmake_build:
name: run cmake build
runs-on: ubuntu-latest
Expand Down
14 changes: 2 additions & 12 deletions libursa/DatabaseSnapshot.cpp
Expand Up @@ -226,16 +226,6 @@ QueryCounters DatabaseSnapshot::execute(const Query &query,
}
}

std::unordered_set<IndexType> types_to_query;
for (const auto *ds : datasets_to_query) {
for (const auto &ndx : ds->get_indexes()) {
types_to_query.emplace(ndx.index_type());
}
}

Query query_copy{query.clone()};
query_copy.precompute(types_to_query, config);

task->spec().estimate_work(datasets_to_query.size());

QueryCounters counters;
Expand All @@ -244,7 +234,7 @@ QueryCounters DatabaseSnapshot::execute(const Query &query,
if (!ds->has_all_taints(taints)) {
continue;
}
ds->execute(query_copy, out, &counters);
ds->execute(query, out, &counters);
}
return counters;
}
Expand Down Expand Up @@ -362,7 +352,7 @@ void DatabaseSnapshot::compact_locked_datasets(Task *task) const {
// After the merging, do more plumbing to add results to the task->changes
// collection.
void DatabaseSnapshot::internal_compact(
Task *task, std::vector<const OnDiskDataset *> datasets) const {
Task *task, const std::vector<const OnDiskDataset *> &datasets) const {
std::vector<std::string> ds_names;

// There's nothing to compact
Expand Down
4 changes: 2 additions & 2 deletions libursa/DatabaseSnapshot.h
Expand Up @@ -33,8 +33,8 @@ class DatabaseSnapshot {

friend class Indexer;

void internal_compact(Task *task,
std::vector<const OnDiskDataset *> datasets) const;
void internal_compact(
Task *task, const std::vector<const OnDiskDataset *> &datasets) const;

// Indexes files with given paths. Ensures that no file will be indexed
// twice - this may be a very memory-heavy operation.
Expand Down
20 changes: 11 additions & 9 deletions libursa/OnDiskDataset.cpp
Expand Up @@ -67,24 +67,26 @@ std::string OnDiskDataset::get_file_name(FileId fid) const {
QueryResult OnDiskDataset::query(const Query &query,
QueryCounters *counters) const {
return query.run(
[this](auto &graphs, QueryCounters *counters) {
QueryResult result = QueryResult::everything();
[this](PrimitiveQuery primitive, QueryCounters *counters) {
for (auto &ndx : indices) {
if (graphs.count(ndx.index_type()) == 0) {
throw std::runtime_error("Unexpected graph type in query");
if (ndx.index_type() == primitive.itype) {
return ndx.query(primitive.trigram, counters);
}
auto subresult{
ndx.query(graphs.at(ndx.index_type()), counters)};
result.do_and(subresult, &counters->ands());
}
return result;
throw std::runtime_error("Unexpected ngram type in query");
},
counters);
}

void OnDiskDataset::execute(const Query &query, ResultWriter *out,
QueryCounters *counters) const {
QueryResult result = this->query(query, counters);
std::unordered_set<IndexType> types_to_query;
for (const auto &ndx : get_indexes()) {
types_to_query.emplace(ndx.index_type());
}
const Query plan = query.plan(types_to_query);

QueryResult result = this->query(plan, counters);
if (result.is_everything()) {
files_index->for_each_filename(
[&out](const std::string &fname) { out->push_back(fname); });
Expand Down
5 changes: 5 additions & 0 deletions libursa/OnDiskIndex.cpp
Expand Up @@ -73,6 +73,11 @@ QueryResult OnDiskIndex::query(const QueryGraph &graph,
return graph.run(oracle, counters);
}

// Returns all files with a given ngram.
//
// `trigram` is forwarded to query_primitive() together with a pointer to
// `counters->reads()`; whatever query_primitive() yields is wrapped in a
// QueryResult. `counters` is dereferenced, so it must not be null.
QueryResult OnDiskIndex::query(TriGram trigram, QueryCounters *counters) const {
    // NOTE(review): assumes query_primitive() returns by value - confirm.
    // In that case the call expression is already a temporary, so wrapping it
    // in std::move() adds nothing.
    return QueryResult(query_primitive(trigram, &counters->reads()));
}

std::pair<uint64_t, uint64_t> OnDiskIndex::get_run_offsets(
TriGram trigram) const {
uint64_t ptrs[2];
Expand Down
1 change: 1 addition & 0 deletions libursa/OnDiskIndex.h
Expand Up @@ -39,6 +39,7 @@ class OnDiskIndex {
const fs::path &get_fpath() const { return fpath; }
IndexType index_type() const { return ntype; }
QueryResult query(const QueryGraph &graph, QueryCounters *counters) const;
QueryResult query(TriGram trigram, QueryCounters *counters) const;
uint64_t real_size() const;
static void on_disk_merge(const fs::path &db_base, const std::string &fname,
IndexType merge_type,
Expand Down
3 changes: 3 additions & 0 deletions libursa/QString.h
Expand Up @@ -37,6 +37,9 @@ class QToken {
// Equivalent to `possible_values.size()`.
uint64_t num_possible_values() const;

// Returns true if the QToken is unique, i.e. `opts_` holds exactly one
// option, so there is only one possible value.
bool unique() const { return opts_.size() == 1; }

// Returns true, if the QToken is empty (doesn't accept any character).
bool empty() const { return opts_.empty(); }

Expand Down
110 changes: 92 additions & 18 deletions libursa/Query.cpp
Expand Up @@ -69,6 +69,14 @@ const QString &Query::as_value() const {

std::string Query::as_string_repr() const {
std::string out = "";
if (!query_plan.empty()) {
// Query is already after planning stage. Show low-level representation.
for (const auto &token : query_plan) {
out += fmt::format("[{:x}]", token.trigram);
}
return out;
}
// No query plan yet. Show stringlike representation.
for (const auto &token : value) {
if (token.num_possible_values() == 1) {
out += token.possible_values()[0];
Expand Down Expand Up @@ -207,40 +215,107 @@ QueryGraph to_query_graph(const QString &str, int size,
return result;
}

void Query::precompute(const std::unordered_set<IndexType> &types_to_query,
const DatabaseConfig &config) {
if (type == QueryType::PRIMITIVE) {
value_graphs.clear();
for (const auto &ntype : types_to_query) {
TokenValidator validator = get_validator_for(ntype);
size_t input_len = get_ngram_size_for(ntype);
auto graph{to_query_graph(value, input_len, config, validator)};
value_graphs.emplace(ntype, std::move(graph));
// For primitive queries, find a minimal covering set of ngram queries and
// return it. If there are multiple disconnected components, AND them.
// For example, "abcde\x??efg" will return abcd & bcde & efg
//
// `types_to_query` is the set of index types the plan may use; an ngram is
// only emitted for a type contained in it. `value` is the query string being
// scanned from left to right. The returned vector lists the selected ngrams
// in the order in which they were picked.
std::vector<PrimitiveQuery> plan_qstring(
    const std::unordered_set<IndexType> &types_to_query, const QString &value) {
    std::vector<PrimitiveQuery> plan;

    const bool has_gram3 = types_to_query.count(IndexType::GRAM3) != 0;
    const bool has_text4 = types_to_query.count(IndexType::TEXT4) != 0;
    const bool has_wide8 = types_to_query.count(IndexType::WIDE8) != 0;
    const bool has_hash4 = types_to_query.count(IndexType::HASH4) != 0;

    // `i` is the current index. `skip_to` is used to keep track of last
    // "handled" byte. For example there's no point in adding 3gram "bcd"
    // when 4gram "abcd" was already added. Only relevant for wide8 ngrams.
    int i = 0;
    int skip_to = 0;
    // `i` never goes negative, so the cast only makes the unsigned comparison
    // against value.size() explicit.
    while (static_cast<size_t>(i) + 2 < value.size()) {
        // If wide8 index is supported, try to add a token and skip 6 bytes.
        if (has_wide8) {
            if (const auto gram = convert_gram(IndexType::WIDE8, i, value)) {
                plan.emplace_back(IndexType::WIDE8, *gram);
                skip_to = i + 6;
                i += 2;
                continue;
            }
        }
        // If text4 index is supported, try to add a token and skip 2 bytes.
        if (has_text4) {
            if (const auto gram = convert_gram(IndexType::TEXT4, i, value)) {
                plan.emplace_back(IndexType::TEXT4, *gram);
                skip_to = i + 2;
                i += 1;
                continue;
            }
        }
        // If hash4 index is supported and current ngram is not text, try
        // hash4. The conversion is only attempted when its result can
        // actually be used.
        if (has_hash4 && i >= skip_to - 1) {
            if (const auto gram = convert_gram(IndexType::HASH4, i, value)) {
                plan.emplace_back(IndexType::HASH4, *gram);
                // Don't continue here - gram3 can give us more information.
            }
        }
        // Otherwise, add a regular gram3 token.
        if (has_gram3 && i >= skip_to) {
            if (const auto gram = convert_gram(IndexType::GRAM3, i, value)) {
                plan.emplace_back(IndexType::GRAM3, *gram);
            }
        }
        // Whether or not an ngram was added, move forward by one position.
        i += 1;
    }

    // Return the local by name so that copy elision (or an implicit move) can
    // apply; an explicit std::move() here would prevent the elision.
    return plan;
}

// Produces the planned version of this query for the index types listed in
// `types_to_query`. A PRIMITIVE query is planned from its `value` through
// plan_qstring(); any other query plans each of its subqueries and wraps the
// results in a new node of the same type.
Query Query::plan(const std::unordered_set<IndexType> &types_to_query) const {
    // Literal query: the plan is computed directly from the stored value.
    if (type == QueryType::PRIMITIVE) {
        return Query(plan_qstring(types_to_query, value));
    }

    // Compound query: plan the children first, in their original order.
    std::vector<Query> planned_children;
    for (const auto &subquery : queries) {
        planned_children.emplace_back(subquery.plan(types_to_query));
    }

    if (type != QueryType::MIN_OF) {
        return Query(type, std::move(planned_children));
    }
    // MIN_OF nodes are built from `count` rather than from the type tag.
    return Query(count, std::move(planned_children));
}

QueryResult Query::run(const QueryPrimitive &primitive,
QueryCounters *counters) const {
if (type == QueryType::PRIMITIVE) {
return primitive(value_graphs, counters);
} else if (type == QueryType::AND) {
auto result = QueryResult::everything();
for (const auto &token : query_plan) {
auto next = primitive(token, counters);
result.do_and(next, &counters->ands());
if (result.is_empty()) {
break;
}
}
return result;
}
if (type == QueryType::AND) {
auto result = QueryResult::everything();
for (const auto &query : queries) {
result.do_and(query.run(primitive, counters), &counters->ands());
}
return result;
} else if (type == QueryType::OR) {
}
if (type == QueryType::OR) {
auto result = QueryResult::empty();
for (const auto &query : queries) {
result.do_or(query.run(primitive, counters), &counters->ors());
}
return result;
} else if (type == QueryType::MIN_OF) {
}
if (type == QueryType::MIN_OF) {
std::vector<QueryResult> results;
std::vector<const QueryResult *> results_ptrs;
results.reserve(queries.size());
Expand All @@ -250,7 +325,6 @@ QueryResult Query::run(const QueryPrimitive &primitive,
results_ptrs.emplace_back(&results.back());
}
return QueryResult::do_min_of(count, results_ptrs, &counters->minofs());
} else {
throw std::runtime_error("Unexpected query type");
}
throw std::runtime_error("Unexpected query type");
}
37 changes: 30 additions & 7 deletions libursa/Query.h
Expand Up @@ -15,13 +15,32 @@

enum class QueryType { PRIMITIVE = 1, AND = 2, OR = 3, MIN_OF = 4 };

using QueryPrimitive = std::function<QueryResult(
const std::unordered_map<IndexType, QueryGraph> &, QueryCounters *counter)>;
// Small utility class to represent a ngram along with its type.
// This is different to the TriGram typedef, because TriGram doesn't know what
// type of index it represents.
class PrimitiveQuery {
public:
PrimitiveQuery(IndexType itype, TriGram trigram)
: itype(itype), trigram(trigram) {}

const IndexType itype;
const TriGram trigram;
};

using QueryPrimitive =
std::function<QueryResult(PrimitiveQuery, QueryCounters *counter)>;

// Query represents the query as provided by the user.
// A Query can contain subqueries (using AND/OR/MIN_OF) or be a literal query.
// There are two kinds of literal query objects: "plain" and "planned".
// All queries start as plain, represented by a QString, and are independent
// of the database. Before a query can be run it must be planned (using the
// plan() method); planning is the point at which the query decides which
// ngrams will actually be checked.
class Query {
private:
Query(const Query &other)
: type(other.type), value_graphs(), count(other.count) {
: type(other.type), query_plan(), count(other.count) {
queries.reserve(other.queries.size());
for (const auto &query : other.queries) {
queries.emplace_back(query.clone());
Expand All @@ -32,6 +51,11 @@ class Query {
}
}

explicit Query(std::vector<PrimitiveQuery> &&query_plan)
: type(QueryType::PRIMITIVE),
query_plan(std::move(query_plan)),
value() {}

public:
explicit Query(QString &&qstr);
explicit Query(uint32_t count, std::vector<Query> &&queries);
Expand All @@ -47,16 +71,15 @@ class Query {

QueryResult run(const QueryPrimitive &primitive,
QueryCounters *counters) const;
void precompute(const std::unordered_set<IndexType> &types_to_query,
const DatabaseConfig &config);
Query plan(const std::unordered_set<IndexType> &types_to_query) const;

Query clone() const { return Query(*this); }

private:
QueryType type;
// used for QueryType::PRIMITIVE
QString value;
std::unordered_map<IndexType, QueryGraph> value_graphs;
QString value; // before plan()
std::vector<PrimitiveQuery> query_plan; // after plan()
// used for QueryType::MIN_OF
uint32_t count;
// used for QueryType::AND/OR/MIN_OF
Expand Down
16 changes: 16 additions & 0 deletions libursa/Utils.cpp
Expand Up @@ -103,6 +103,22 @@ std::optional<TriGram> convert_gram(IndexType type, uint64_t source) {
return std::make_optional(result[0]);
}

// Converts the ngram of `string` that starts at position `index` into the
// id produced by convert_gram(type, source).
//
// The ngram length is get_ngram_size_for(type). Returns std::nullopt when
// `index` is negative, when the ngram would extend past the end of `string`,
// or when any token inside the ngram is not unique. Otherwise the single
// value of each token is packed into `source`, 8 bits per token with the
// first token in the most significant position, and the result of the
// two-argument overload is returned.
std::optional<TriGram> convert_gram(IndexType type, int index,
                                    const QString &string) {
    // A negative index could otherwise slip through the length check below
    // (e.g. -1 + 3 == 2) and be used to index the string.
    if (index < 0) {
        return std::nullopt;
    }
    const auto start = static_cast<size_t>(index);
    const auto size = static_cast<size_t>(get_ngram_size_for(type));
    if (start + size > string.size()) {
        return std::nullopt;
    }
    uint64_t source = 0;
    for (size_t offset = 0; offset < size; ++offset) {
        const auto &token = string[start + offset];
        // A token with several (or zero) options has no single value to pack.
        if (!token.unique()) {
            return std::nullopt;
        }
        source = (source << 8) | token.possible_values()[0];
    }
    return convert_gram(type, source);
}

void gen_b64grams(const uint8_t *mem, uint64_t size,
const TrigramCallback &cb) {
if (size < 4) {
Expand Down
2 changes: 2 additions & 0 deletions libursa/Utils.h
Expand Up @@ -46,6 +46,8 @@ void combinations(const QString &qstr, size_t len, const TrigramGenerator &gen,

// Converts ngram from raw representation to compressed 3byte id.
std::optional<TriGram> convert_gram(IndexType type, uint64_t source);
std::optional<TriGram> convert_gram(IndexType type, int index,
const QString &string);

template <TrigramGenerator gen>
std::vector<TriGram> get_trigrams_eager(const uint8_t *mem, size_t size) {
Expand Down
2 changes: 1 addition & 1 deletion teste2e/test_indexing.py
Expand Up @@ -98,7 +98,7 @@ def test_wide8_index_works_as_expected(ursadb: UrsadbTestContext):
check_query(
ursadb,
"{61 (00|01) (62|63) (00|01) (63|62) (00|01) 64 00}",
["vvv", "qqq"],
["kot", "zzz", "yyy", "vvv", "qqq"],
)
assert get_index_hash(ursadb, "wide8")[:16] == "c73b55c36445ca6b"

Expand Down