From bcb058f999f9826dfcb9beb5435354b95abacc71 Mon Sep 17 00:00:00 2001 From: johanngan Date: Mon, 12 Jun 2023 15:41:55 -0500 Subject: [PATCH] Add case insensitive and dot-all modes to RegExpTree dictionary The new per-dictionary settings control regex match semantics around case sensitivity and the '.' wildcard with newlines. They must be set at the dictionary level since they're applied to regex engines at pattern-compile-time. - regexp_dict_flag_case_insensitive: case insensitive matching - regexp_dict_flag_dotall: '.' matches all characters including newlines They correspond to HS_FLAG_CASELESS and HS_FLAG_DOTALL in Vectorscan, and to the case_sensitive (inverted) and dot_nl options in RE2. These are the most useful options compatible with the internal behavior of RegExpTreeDictionary around splitting up simple and complex patterns between Vectorscan and RE2. The alternative is to use (?i) and/or (?s) for all patterns. However, (?s) isn't handled properly by OptimizedRegularExpression::analyze(). And while (?i) is, it still causes the dictionary to treat the pattern as "complex" for sequential scanning with RE2 rather than multi-matching with Vectorscan, even though Vectorscan supports case insensitive literal matching. Setting dictionary-wide flags is more convenient and also circumvents these problems. 
--- docs/en/sql-reference/dictionaries/index.md | 6 ++ src/Core/Settings.h | 2 + src/Dictionaries/RegExpTreeDictionary.cpp | 28 +++++++++- src/Dictionaries/RegExpTreeDictionary.h | 9 ++- ...04_regexp_dictionary_yaml_source.reference | 12 ++++ .../02504_regexp_dictionary_yaml_source.sh | 56 +++++++++++++++++++ 6 files changed, 108 insertions(+), 5 deletions(-) diff --git a/docs/en/sql-reference/dictionaries/index.md b/docs/en/sql-reference/dictionaries/index.md index dd8031461e08..80f728a33df5 100644 --- a/docs/en/sql-reference/dictionaries/index.md +++ b/docs/en/sql-reference/dictionaries/index.md @@ -2361,6 +2361,12 @@ Result: └────────────────────────────────────────┴───────────────────────────────────────────────────────────────────────────────────────┘ ``` +#### Matching Modes + +Pattern matching behavior can be modified with certain dictionary settings: +- `regexp_dict_flag_case_insensitive`: Use case-insensitive matching (defaults to `false`). Can be overridden in individual expressions with `(?i)` and `(?-i)`. +- `regexp_dict_flag_dotall`: Allow '.' to match newline characters (defaults to `false`). + ### Use Regular Expression Tree Dictionary in ClickHouse Cloud Above used `YAMLRegExpTree` source works in ClickHouse Open Source but not in ClickHouse Cloud. To use regexp tree dictionaries in ClickHouse could, first create a regexp tree dictionary from a YAML file locally in ClickHouse Open Source, then dump this dictionary into a CSV file using the `dictionary` table function and the [INTO OUTFILE](../statements/select/into-outfile.md) clause. 
diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 620cc8fd67f9..b3a44a7a8eec 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -1053,6 +1053,8 @@ class IColumn; \ M(Bool, format_display_secrets_in_show_and_select, false, "Do not hide secrets in SHOW and SELECT queries.", IMPORTANT) \ M(Bool, regexp_dict_allow_hyperscan, true, "Allow regexp_tree dictionary using Hyperscan library.", 0) \ + M(Bool, regexp_dict_flag_case_insensitive, false, "Use case-insensitive matching for a regexp_tree dictionary. Can be overridden in individual expressions with (?i) and (?-i).", 0) \ + M(Bool, regexp_dict_flag_dotall, false, "Allow '.' to match newline characters for a regexp_tree dictionary.", 0) \ \ M(Bool, dictionary_use_async_executor, false, "Execute a pipeline for reading from a dictionary with several threads. It's supported only by DIRECT dictionary with CLICKHOUSE source.", 0) \ M(Bool, precise_float_parsing, false, "Prefer more precise (but slower) float parsing algorithm", 0) \ diff --git a/src/Dictionaries/RegExpTreeDictionary.cpp b/src/Dictionaries/RegExpTreeDictionary.cpp index e9ba4c6268ec..29ef71b3ce0f 100644 --- a/src/Dictionaries/RegExpTreeDictionary.cpp +++ b/src/Dictionaries/RegExpTreeDictionary.cpp @@ -206,6 +206,8 @@ void RegExpTreeDictionary::initRegexNodes(Block & block) re2_st::RE2::Options regexp_options; regexp_options.set_log_errors(false); + regexp_options.set_case_sensitive(!flag_case_insensitive); + regexp_options.set_dot_nl(flag_dotall); RegexTreeNodePtr node = std::make_shared(id, parent_id, regex, regexp_options); int num_captures = std::min(node->searcher.NumberOfCapturingGroups() + 1, 10); @@ -330,11 +332,20 @@ void RegExpTreeDictionary::loadData() std::vector flags; std::vector lengths; + // Notes: + // - Always set HS_FLAG_SINGLEMATCH because we only care about whether a pattern matches at least once + // - HS_FLAG_CASELESS is supported by hs_compile_lit_multi, so we should set it if flag_case_insensitive is set. 
+ // - HS_FLAG_DOTALL is not supported by hs_compile_lit_multi, but the '.' wildcard can't appear in any of the simple regexps + // anyway, so even if flag_dotall is set, we only need to configure the RE2 searcher, and don't need to set any Hyperscan flags. + unsigned int flag_bits = HS_FLAG_SINGLEMATCH; + if (flag_case_insensitive) + flag_bits |= HS_FLAG_CASELESS; + for (const std::string & simple_regexp : simple_regexps) { patterns.push_back(simple_regexp.data()); lengths.push_back(simple_regexp.size()); - flags.push_back(HS_FLAG_SINGLEMATCH); + flags.push_back(flag_bits); } hs_database_t * db = nullptr; @@ -380,12 +391,16 @@ RegExpTreeDictionary::RegExpTreeDictionary( const DictionaryStructure & structure_, DictionarySourcePtr source_ptr_, Configuration configuration_, - bool use_vectorscan_) + bool use_vectorscan_, + bool flag_case_insensitive_, + bool flag_dotall_) : IDictionary(id_), structure(structure_), source_ptr(source_ptr_), configuration(configuration_), use_vectorscan(use_vectorscan_), + flag_case_insensitive(flag_case_insensitive_), + flag_dotall(flag_dotall_), logger(&Poco::Logger::get("RegExpTreeDictionary")) { if (auto * ch_source = typeid_cast(source_ptr.get())) @@ -859,7 +874,14 @@ void registerDictionaryRegExpTree(DictionaryFactory & factory) auto context = copyContextAndApplySettingsFromDictionaryConfig(global_context, config, config_prefix); - return std::make_unique(dict_id, dict_struct, std::move(source_ptr), configuration, context->getSettings().regexp_dict_allow_hyperscan); + return std::make_unique( + dict_id, + dict_struct, + std::move(source_ptr), + configuration, + context->getSettings().regexp_dict_allow_hyperscan, + context->getSettings().regexp_dict_flag_case_insensitive, + context->getSettings().regexp_dict_flag_dotall); }; factory.registerLayout("regexp_tree", create_layout, true); diff --git a/src/Dictionaries/RegExpTreeDictionary.h b/src/Dictionaries/RegExpTreeDictionary.h index 7f1d0ee1e881..62008bb5aaec 100644 --- 
a/src/Dictionaries/RegExpTreeDictionary.h +++ b/src/Dictionaries/RegExpTreeDictionary.h @@ -49,7 +49,9 @@ class RegExpTreeDictionary : public IDictionary const DictionaryStructure & structure_, DictionarySourcePtr source_ptr_, Configuration configuration_, - bool use_vectorscan_); + bool use_vectorscan_, + bool flag_case_insensitive_, + bool flag_dotall_); std::string getTypeName() const override { return name; } @@ -85,7 +87,8 @@ class RegExpTreeDictionary : public IDictionary std::shared_ptr clone() const override { - return std::make_shared(getDictionaryID(), structure, source_ptr->clone(), configuration, use_vectorscan); + return std::make_shared( + getDictionaryID(), structure, source_ptr->clone(), configuration, use_vectorscan, flag_case_insensitive, flag_dotall); } ColumnUInt8::Ptr hasKeys(const Columns &, const DataTypes &) const override @@ -189,6 +192,8 @@ class RegExpTreeDictionary : public IDictionary using RegexTreeNodePtr = std::shared_ptr; bool use_vectorscan; + bool flag_case_insensitive; + bool flag_dotall; std::vector simple_regexps; std::vector regexp_ids; diff --git a/tests/queries/0_stateless/02504_regexp_dictionary_yaml_source.reference b/tests/queries/0_stateless/02504_regexp_dictionary_yaml_source.reference index 79871e3716cd..de5f14ee1ffc 100644 --- a/tests/queries/0_stateless/02504_regexp_dictionary_yaml_source.reference +++ b/tests/queries/0_stateless/02504_regexp_dictionary_yaml_source.reference @@ -27,3 +27,15 @@ GitHub Documentation ['GitHub Documentation','GitHub'] Documentation ['Documentation'] +['foo','(?i)foo','(?-i)foo'] +['(?i)foo'] +['hello.*world','(?i)hello.*world','(?-i)hello.*world'] +[] +['(?i)hello.*world'] +[] +['foo','(?i)foo','(?-i)foo'] +['foo','(?i)foo'] +['hello.*world','(?i)hello.*world','(?-i)hello.*world'] +['hello.*world','(?i)hello.*world','(?-i)hello.*world'] +['hello.*world','(?i)hello.*world'] +['hello.*world','(?i)hello.*world'] diff --git a/tests/queries/0_stateless/02504_regexp_dictionary_yaml_source.sh 
b/tests/queries/0_stateless/02504_regexp_dictionary_yaml_source.sh index 5e8985406ae8..34ea8b9bfbed 100755 --- a/tests/queries/0_stateless/02504_regexp_dictionary_yaml_source.sh +++ b/tests/queries/0_stateless/02504_regexp_dictionary_yaml_source.sh @@ -239,10 +239,66 @@ select dictGet('regexp_dict3', 'tag', '/docs'); select dictGetAll('regexp_dict3', 'tag', '/docs'); " +# Test case-insensitive and dot-all match modes +cat > "$yaml" <