CERT-Polska · msm-code · Dec 14, 2022 · Dec 13, 2022 · Dec 13, 2022 · Dec 13, 2022
diff --git a/.github/workflows/test_code.yml b/.github/workflows/test_code.yml
@@ -22,24 +22,6 @@ jobs:
       run: cmake --build build --target format
     - name: check formatting
       run: git diff --exit-code || ( >&2 echo "Please run 'make format' to fix these issues automatically." && false )
-  test_clang_tidy:
-    name: test clang tidy
-    runs-on: ubuntu-latest
-    steps:
-    - uses: actions/checkout@v2
-      with:
-        submodules: true
-    - name: checkout master
-      run: git fetch origin master
-    - name: install dependencies
-      run: sudo apt-get install -y clang-tidy libzmq3-dev
-    - name: find changed files
-      run: git diff origin/master --name-only "*.cpp" > /tmp/filestocheck && cat /tmp/filestocheck
-    - name: clang-tidy
-      run: if [ -s /tmp/filestocheck ]; then clang-tidy $(cat /tmp/filestocheck) -checks="google-*,performance-*" -fix -- -std=c++17 -I extern/ -I extern/spdlog/include/ -I.; fi
-    - name: FYI git diff
-      run: git diff
-      if: ${{ always() }}
   run_cmake_build:
     name: run cmake build
     runs-on: ubuntu-latest

diff --git a/libursa/DatabaseSnapshot.cpp b/libursa/DatabaseSnapshot.cpp
@@ -226,16 +226,6 @@ QueryCounters DatabaseSnapshot::execute(const Query &query,
         }
     }
 
-    std::unordered_set<IndexType> types_to_query;
-    for (const auto *ds : datasets_to_query) {
-        for (const auto &ndx : ds->get_indexes()) {
-            types_to_query.emplace(ndx.index_type());
-        }
-    }
-
-    Query query_copy{query.clone()};
-    query_copy.precompute(types_to_query, config);
-
     task->spec().estimate_work(datasets_to_query.size());
 
     QueryCounters counters;
@@ -244,7 +234,7 @@ QueryCounters DatabaseSnapshot::execute(const Query &query,
         if (!ds->has_all_taints(taints)) {
             continue;
         }
-        ds->execute(query_copy, out, &counters);
+        ds->execute(query, out, &counters);
     }
     return counters;
 }
@@ -362,7 +352,7 @@ void DatabaseSnapshot::compact_locked_datasets(Task *task) const {
 // After the merging, do more plumbing to add results to the task->changes
 // collection.
 void DatabaseSnapshot::internal_compact(
-    Task *task, std::vector<const OnDiskDataset *> datasets) const {
+    Task *task, const std::vector<const OnDiskDataset *> &datasets) const {
     std::vector<std::string> ds_names;
 
     // There's nothing to compact

diff --git a/libursa/DatabaseSnapshot.h b/libursa/DatabaseSnapshot.h
@@ -33,8 +33,8 @@ class DatabaseSnapshot {
 
     friend class Indexer;
 
-    void internal_compact(Task *task,
-                          std::vector<const OnDiskDataset *> datasets) const;
+    void internal_compact(
+        Task *task, const std::vector<const OnDiskDataset *> &datasets) const;
 
     // Indexes files with given paths. Ensures that no file will be indexed
     // twice - this may be a very memory-heavy operation.

diff --git a/libursa/OnDiskDataset.cpp b/libursa/OnDiskDataset.cpp
@@ -67,24 +67,26 @@ std::string OnDiskDataset::get_file_name(FileId fid) const {
 QueryResult OnDiskDataset::query(const Query &query,
                                  QueryCounters *counters) const {
     return query.run(
-        [this](auto &graphs, QueryCounters *counters) {
-            QueryResult result = QueryResult::everything();
+        [this](PrimitiveQuery primitive, QueryCounters *counters) {  // callback query.run() uses to resolve one planned ngram
             for (auto &ndx : indices) {
-                if (graphs.count(ndx.index_type()) == 0) {
-                    throw std::runtime_error("Unexpected graph type in query");
+                if (ndx.index_type() == primitive.itype) {  // only an index of the requested type can answer
+                    return ndx.query(primitive.trigram, counters);  // the first matching index decides the result
                 }
-                auto subresult{
-                    ndx.query(graphs.at(ndx.index_type()), counters)};
-                result.do_and(subresult, &counters->ands());
             }
-            return result;
+            throw std::runtime_error("Unexpected ngram type in query");  // reached when no index has primitive.itype
         },
         counters);
 }
 
 void OnDiskDataset::execute(const Query &query, ResultWriter *out,
                             QueryCounters *counters) const {
-    QueryResult result = this->query(query, counters);
+    std::unordered_set<IndexType> types_to_query;
+    for (const auto &ndx : get_indexes()) {
+        types_to_query.emplace(ndx.index_type());
+    }
+    const Query plan = query.plan(types_to_query);
+
+    QueryResult result = this->query(plan, counters);
     if (result.is_everything()) {
         files_index->for_each_filename(
             [&out](const std::string &fname) { out->push_back(fname); });

diff --git a/libursa/OnDiskIndex.cpp b/libursa/OnDiskIndex.cpp
@@ -73,6 +73,11 @@ QueryResult OnDiskIndex::query(const QueryGraph &graph,
     return graph.run(oracle, counters);
 }
 
+// Returns all files with a given ngram, as reported by query_primitive().
+// The reads counter taken from `counters` is handed to query_primitive().
+QueryResult OnDiskIndex::query(TriGram trigram, QueryCounters *counters) const {
+    return QueryResult(query_primitive(trigram, &counters->reads()));
+}
+
 std::pair<uint64_t, uint64_t> OnDiskIndex::get_run_offsets(
     TriGram trigram) const {
     uint64_t ptrs[2];

diff --git a/libursa/OnDiskIndex.h b/libursa/OnDiskIndex.h
@@ -39,6 +39,7 @@ class OnDiskIndex {
     const fs::path &get_fpath() const { return fpath; }
     IndexType index_type() const { return ntype; }
     QueryResult query(const QueryGraph &graph, QueryCounters *counters) const;
+    QueryResult query(TriGram trigram, QueryCounters *counters) const;
     uint64_t real_size() const;
     static void on_disk_merge(const fs::path &db_base, const std::string &fname,
                               IndexType merge_type,

diff --git a/libursa/QString.h b/libursa/QString.h
@@ -37,6 +37,9 @@ class QToken {
     // Equivalent to `possible_values.size()`.
     uint64_t num_possible_values() const;
 
+    // Returns true if the QToken is unique (has exactly one possible value).
+    bool unique() const { return opts_.size() == 1; }
+
     // Returns true, if the QToken is empty (doesn't accept any character).
     bool empty() const { return opts_.empty(); }
 

diff --git a/libursa/Query.cpp b/libursa/Query.cpp
@@ -69,6 +69,14 @@ const QString &Query::as_value() const {
 
 std::string Query::as_string_repr() const {
     std::string out = "";
+    if (!query_plan.empty()) {
+        // Query is already after planning stage. Show low-level representation.
+        for (const auto &token : query_plan) {
+            out += fmt::format("[{:x}]", token.trigram);
+        }
+        return out;
+    }
+    // No query plan yet. Show stringlike representation.
     for (const auto &token : value) {
         if (token.num_possible_values() == 1) {
             out += token.possible_values()[0];
@@ -207,40 +215,107 @@ QueryGraph to_query_graph(const QString &str, int size,
     return result;
 }
 
-void Query::precompute(const std::unordered_set<IndexType> &types_to_query,
-                       const DatabaseConfig &config) {
-    if (type == QueryType::PRIMITIVE) {
-        value_graphs.clear();
-        for (const auto &ntype : types_to_query) {
-            TokenValidator validator = get_validator_for(ntype);
-            size_t input_len = get_ngram_size_for(ntype);
-            auto graph{to_query_graph(value, input_len, config, validator)};
-            value_graphs.emplace(ntype, std::move(graph));
+// For primitive queries, find a minimal covering set of ngram queries and
+// return it. If there are multiple disconnected components, AND them.
+// For example, "abcde\x??efg" will return abcd & bcde & efg
+//
+// `types_to_query` is the set of index types the plan may use and `value` is
+// the literal to cover. Positions where convert_gram() yields no ngram (e.g.
+// when a token is not unique()) are simply stepped over.
+std::vector<PrimitiveQuery> plan_qstring(
+    const std::unordered_set<IndexType> &types_to_query, const QString &value) {
+    std::vector<PrimitiveQuery> plan;
+
+    bool has_gram3 = types_to_query.count(IndexType::GRAM3) != 0;
+    bool has_text4 = types_to_query.count(IndexType::TEXT4) != 0;
+    bool has_wide8 = types_to_query.count(IndexType::WIDE8) != 0;
+    bool has_hash4 = types_to_query.count(IndexType::HASH4) != 0;
+
+    // `i` is the current index. `skip_to` is used to keep track of last
+    // "handled" byte. For example there's no point in adding 3gram "bcd"
+    // when 4gram "abcd" was already added. It is set by the wide8 and text4
+    // branches and checked by the hash4 and gram3 branches.
+    // Everything stays a signed int so that `skip_to - 1` cannot wrap around
+    // and the loop condition does not mix signed and unsigned operands.
+    const int length = static_cast<int>(value.size());
+    int i = 0;
+    int skip_to = 0;
+    while (i + 2 < length) {
+        // If wide8 index is supported, try to add a token and skip 6 bytes.
+        if (has_wide8) {
+            if (const auto &gram = convert_gram(IndexType::WIDE8, i, value)) {
+                plan.emplace_back(IndexType::WIDE8, *gram);
+                skip_to = i + 6;
+                i += 2;
+                continue;
+            }
         }
-    } else {
-        for (auto &query : queries) {
-            query.precompute(types_to_query, config);
+        // If text4 index is supported, try to add a token and skip 2 bytes.
+        if (has_text4) {
+            if (const auto &gram = convert_gram(IndexType::TEXT4, i, value)) {
+                plan.emplace_back(IndexType::TEXT4, *gram);
+                skip_to = i + 2;
+                i += 1;
+                continue;
+            }
+        }
+        // If hash4 index is supported and current ngram is not text, try hash4.
+        const auto &hgram = convert_gram(IndexType::HASH4, i, value);
+        if (i >= (skip_to - 1) && has_hash4 && hgram) {
+            plan.emplace_back(IndexType::HASH4, *hgram);
+            // Don't continue here - gram3 can give us more information.
+        }
+        // Otherwise, add a regular gram3 token.
+        const auto &gram = convert_gram(IndexType::GRAM3, i, value);
+        if (i >= skip_to && gram) {
+            if (has_gram3) {
+                plan.emplace_back(IndexType::GRAM3, *gram);
+            }
+            i += 1;
+            continue;
+        }
+        // If no ngram can be added, remember to move forward.
+        i += 1;
+    }
+
+    return plan;
+}
+
+// Builds a planned copy of this query for the index types listed in
+// `types_to_query`. AND/OR/MIN_OF nodes are rebuilt from their planned
+// subqueries; a literal is replaced by the ngram list from plan_qstring().
+Query Query::plan(const std::unordered_set<IndexType> &types_to_query) const {
+    if (type != QueryType::PRIMITIVE) {
+        std::vector<Query> plans;
+        // The number of subqueries is known up front, so allocate once.
+        plans.reserve(queries.size());
+        for (const auto &query : queries) {
+            plans.emplace_back(query.plan(types_to_query));
+        }
+        if (type == QueryType::MIN_OF) {
+            // MIN_OF uses the constructor that also takes `count`.
+            return Query(count, std::move(plans));
         }
+        return Query(type, std::move(plans));
     }
+
+    return Query(plan_qstring(types_to_query, value));
 }
 
 QueryResult Query::run(const QueryPrimitive &primitive,
                        QueryCounters *counters) const {
     if (type == QueryType::PRIMITIVE) {
-        return primitive(value_graphs, counters);
-    } else if (type == QueryType::AND) {
+        auto result = QueryResult::everything();
+        for (const auto &token : query_plan) {
+            auto next = primitive(token, counters);
+            result.do_and(next, &counters->ands());
+            if (result.is_empty()) {
+                break;
+            }
+        }
+        return result;
+    }
+    if (type == QueryType::AND) {
         auto result = QueryResult::everything();
         for (const auto &query : queries) {
             result.do_and(query.run(primitive, counters), &counters->ands());
         }
         return result;
-    } else if (type == QueryType::OR) {
+    }
+    if (type == QueryType::OR) {
         auto result = QueryResult::empty();
         for (const auto &query : queries) {
             result.do_or(query.run(primitive, counters), &counters->ors());
         }
         return result;
-    } else if (type == QueryType::MIN_OF) {
+    }
+    if (type == QueryType::MIN_OF) {
         std::vector<QueryResult> results;
         std::vector<const QueryResult *> results_ptrs;
         results.reserve(queries.size());
@@ -250,7 +325,6 @@ QueryResult Query::run(const QueryPrimitive &primitive,
             results_ptrs.emplace_back(&results.back());
         }
         return QueryResult::do_min_of(count, results_ptrs, &counters->minofs());
-    } else {
-        throw std::runtime_error("Unexpected query type");
     }
+    throw std::runtime_error("Unexpected query type");
 }
diff --git a/libursa/Query.h b/libursa/Query.h
@@ -15,13 +15,32 @@
 
 enum class QueryType { PRIMITIVE = 1, AND = 2, OR = 3, MIN_OF = 4 };
 
-using QueryPrimitive = std::function<QueryResult(
-    const std::unordered_map<IndexType, QueryGraph> &, QueryCounters *counter)>;
+// Small utility class to represent an ngram along with its type.
+// This is different to the TriGram typedef, because TriGram doesn't know what
+// type of index it represents.
+// The members are deliberately not `const`: const data members would delete
+// the assignment operators, which makes it impossible to sort, erase from or
+// reassign a std::vector<PrimitiveQuery>.
+class PrimitiveQuery {
+   public:
+    PrimitiveQuery(IndexType itype, TriGram trigram)
+        : itype(itype), trigram(trigram) {}
+
+    IndexType itype;  // index type this ngram has to be looked up in
+    TriGram trigram;  // ngram value, as produced by convert_gram()
+};
 
+using QueryPrimitive =
+    std::function<QueryResult(PrimitiveQuery, QueryCounters *counter)>;
+
+// Query represents the query as provided by the user.
+// Query can contain subqueries (using AND/OR/MINOF) or be a literal query.
+// There are actually two types of literal query objects - "plain" and
+// "planned". All queries start as plain - represented by QString. They are
+// independent of the database. Before actually running them, they must be
+// planned (using a plan() method). At this point query decides which ngrams
+// will actually be checked.
 class Query {
    private:
     Query(const Query &other)
-        : type(other.type), value_graphs(), count(other.count) {
+        : type(other.type), query_plan(), count(other.count) {
         queries.reserve(other.queries.size());
         for (const auto &query : other.queries) {
             queries.emplace_back(query.clone());
@@ -32,6 +51,11 @@ class Query {
         }
     }
 
+    explicit Query(std::vector<PrimitiveQuery> &&query_plan)
+        : type(QueryType::PRIMITIVE),
+          query_plan(std::move(query_plan)),
+          value() {}
+
    public:
     explicit Query(QString &&qstr);
     explicit Query(uint32_t count, std::vector<Query> &&queries);
@@ -47,16 +71,15 @@ class Query {
 
     QueryResult run(const QueryPrimitive &primitive,
                     QueryCounters *counters) const;
-    void precompute(const std::unordered_set<IndexType> &types_to_query,
-                    const DatabaseConfig &config);
+    Query plan(const std::unordered_set<IndexType> &types_to_query) const;
 
     Query clone() const { return Query(*this); }
 
    private:
     QueryType type;
     // used for QueryType::PRIMITIVE
-    QString value;
-    std::unordered_map<IndexType, QueryGraph> value_graphs;
+    QString value;                           // before plan()
+    std::vector<PrimitiveQuery> query_plan;  // after plan()
     // used for QueryType::MIN_OF
     uint32_t count;
     // used for QueryType::AND/OR/MIN_OF

diff --git a/libursa/Utils.cpp b/libursa/Utils.cpp
@@ -103,6 +103,22 @@ std::optional<TriGram> convert_gram(IndexType type, uint64_t source) {
     return std::make_optional(result[0]);
 }
 
+// Converts the ngram of type `type` that starts at `string[index]`.
+// Returns nullopt if `index` is negative, if the ngram would not fit inside
+// `string`, or if any of its tokens is not unique(). Otherwise the tokens are
+// packed into one integer and passed to convert_gram(type, source).
+std::optional<TriGram> convert_gram(IndexType type, int index,
+                                    const QString &string) {
+    const int size = get_ngram_size_for(type);
+    // Compare as signed ints: this avoids a signed/unsigned comparison and
+    // rejects negative indexes instead of reading before the first token.
+    if (index < 0 || index + size > static_cast<int>(string.size())) {
+        return std::nullopt;
+    }
+    uint64_t source = 0;
+    for (int i = 0; i < size; i++) {
+        const auto &token = string[index + i];
+        if (!token.unique()) {
+            return std::nullopt;
+        }
+        // Force an unsigned byte so a (possibly signed) char cannot
+        // sign-extend into the upper bits of `source`.
+        const uint8_t byte = token.possible_values()[0];
+        // The first token ends up in the most significant used byte.
+        source = (source << 8) | byte;
+    }
+    return convert_gram(type, source);
+}
+
 void gen_b64grams(const uint8_t *mem, uint64_t size,
                   const TrigramCallback &cb) {
     if (size < 4) {

diff --git a/libursa/Utils.h b/libursa/Utils.h
@@ -46,6 +46,8 @@ void combinations(const QString &qstr, size_t len, const TrigramGenerator &gen,
 
 // Converts ngram from raw representation to compressed 3byte id.
 std::optional<TriGram> convert_gram(IndexType type, uint64_t source);
+std::optional<TriGram> convert_gram(IndexType type, int index,
+                                    const QString &string);
 
 template <TrigramGenerator gen>
 std::vector<TriGram> get_trigrams_eager(const uint8_t *mem, size_t size) {

diff --git a/teste2e/test_indexing.py b/teste2e/test_indexing.py
@@ -98,7 +98,7 @@ def test_wide8_index_works_as_expected(ursadb: UrsadbTestContext):
     check_query(
         ursadb,
         "{61 (00|01) (62|63) (00|01) (63|62) (00|01) 64 00}",
-        ["vvv", "qqq"],
+        ["kot", "zzz", "yyy", "vvv", "qqq"],
     )
     assert get_index_hash(ursadb, "wide8")[:16] == "c73b55c36445ca6b"