Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add case insensitive and dot-all modes to RegExpTree dictionary #50906

Merged
merged 2 commits into from Sep 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/en/sql-reference/dictionaries/index.md
Expand Up @@ -2361,6 +2361,12 @@ Result:
└────────────────────────────────────────┴───────────────────────────────────────────────────────────────────────────────────────┘
```

#### Matching Modes

Pattern matching behavior can be modified with certain dictionary settings:
- `regexp_dict_flag_case_insensitive`: Use case-insensitive matching (defaults to `false`). Can be overridden in individual expressions with `(?i)` and `(?-i)`.
- `regexp_dict_flag_dotall`: Allow `.` to match newline characters (defaults to `false`).

### Use Regular Expression Tree Dictionary in ClickHouse Cloud

The `YAMLRegExpTree` source used above works in ClickHouse Open Source but not in ClickHouse Cloud. To use regexp tree dictionaries in ClickHouse Cloud, first create a regexp tree dictionary from a YAML file locally in ClickHouse Open Source, then dump this dictionary into a CSV file using the `dictionary` table function and the [INTO OUTFILE](../statements/select/into-outfile.md) clause.
Expand Down
2 changes: 2 additions & 0 deletions src/Core/Settings.h
Expand Up @@ -1053,6 +1053,8 @@ class IColumn;
\
M(Bool, format_display_secrets_in_show_and_select, false, "Do not hide secrets in SHOW and SELECT queries.", IMPORTANT) \
M(Bool, regexp_dict_allow_hyperscan, true, "Allow regexp_tree dictionary using Hyperscan library.", 0) \
M(Bool, regexp_dict_flag_case_insensitive, false, "Use case-insensitive matching for a regexp_tree dictionary. Can be overridden in individual expressions with (?i) and (?-i).", 0) \
M(Bool, regexp_dict_flag_dotall, false, "Allow '.' to match newline characters for a regexp_tree dictionary.", 0) \
\
M(Bool, dictionary_use_async_executor, false, "Execute a pipeline for reading from a dictionary with several threads. It's supported only by DIRECT dictionary with CLICKHOUSE source.", 0) \
M(Bool, precise_float_parsing, false, "Prefer more precise (but slower) float parsing algorithm", 0) \
Expand Down
28 changes: 25 additions & 3 deletions src/Dictionaries/RegExpTreeDictionary.cpp
Expand Up @@ -206,6 +206,8 @@ void RegExpTreeDictionary::initRegexNodes(Block & block)

re2_st::RE2::Options regexp_options;
regexp_options.set_log_errors(false);
regexp_options.set_case_sensitive(!flag_case_insensitive);
regexp_options.set_dot_nl(flag_dotall);
RegexTreeNodePtr node = std::make_shared<RegexTreeNode>(id, parent_id, regex, regexp_options);

int num_captures = std::min(node->searcher.NumberOfCapturingGroups() + 1, 10);
Expand Down Expand Up @@ -330,11 +332,20 @@ void RegExpTreeDictionary::loadData()
std::vector<unsigned int> flags;
std::vector<size_t> lengths;

// Notes:
// - Always set HS_FLAG_SINGLEMATCH because we only care about whether a pattern matches at least once
// - HS_FLAG_CASELESS is supported by hs_compile_lit_multi, so we should set it if flag_case_insensitive is set.
// - HS_FLAG_DOTALL is not supported by hs_compile_lit_multi, but the '.' wildcard can't appear in any of the simple regexps
// anyway, so even if flag_dotall is set, we only need to configure the RE2 searcher, and don't need to set any Hyperscan flags.
unsigned int flag_bits = HS_FLAG_SINGLEMATCH;
if (flag_case_insensitive)
flag_bits |= HS_FLAG_CASELESS;

for (const std::string & simple_regexp : simple_regexps)
{
patterns.push_back(simple_regexp.data());
lengths.push_back(simple_regexp.size());
flags.push_back(HS_FLAG_SINGLEMATCH);
flags.push_back(flag_bits);
}

hs_database_t * db = nullptr;
Expand Down Expand Up @@ -380,12 +391,16 @@ RegExpTreeDictionary::RegExpTreeDictionary(
const DictionaryStructure & structure_,
DictionarySourcePtr source_ptr_,
Configuration configuration_,
bool use_vectorscan_)
bool use_vectorscan_,
bool flag_case_insensitive_,
bool flag_dotall_)
: IDictionary(id_),
structure(structure_),
source_ptr(source_ptr_),
configuration(configuration_),
use_vectorscan(use_vectorscan_),
flag_case_insensitive(flag_case_insensitive_),
flag_dotall(flag_dotall_),
logger(&Poco::Logger::get("RegExpTreeDictionary"))
{
if (auto * ch_source = typeid_cast<ClickHouseDictionarySource *>(source_ptr.get()))
Expand Down Expand Up @@ -859,7 +874,14 @@ void registerDictionaryRegExpTree(DictionaryFactory & factory)

auto context = copyContextAndApplySettingsFromDictionaryConfig(global_context, config, config_prefix);

return std::make_unique<RegExpTreeDictionary>(dict_id, dict_struct, std::move(source_ptr), configuration, context->getSettings().regexp_dict_allow_hyperscan);
return std::make_unique<RegExpTreeDictionary>(
dict_id,
dict_struct,
std::move(source_ptr),
configuration,
context->getSettings().regexp_dict_allow_hyperscan,
context->getSettings().regexp_dict_flag_case_insensitive,
context->getSettings().regexp_dict_flag_dotall);
};

factory.registerLayout("regexp_tree", create_layout, true);
Expand Down
9 changes: 7 additions & 2 deletions src/Dictionaries/RegExpTreeDictionary.h
Expand Up @@ -49,7 +49,9 @@ class RegExpTreeDictionary : public IDictionary
const DictionaryStructure & structure_,
DictionarySourcePtr source_ptr_,
Configuration configuration_,
bool use_vectorscan_);
bool use_vectorscan_,
bool flag_case_insensitive_,
bool flag_dotall_);

std::string getTypeName() const override { return name; }

Expand Down Expand Up @@ -85,7 +87,8 @@ class RegExpTreeDictionary : public IDictionary

std::shared_ptr<const IExternalLoadable> clone() const override
{
return std::make_shared<RegExpTreeDictionary>(getDictionaryID(), structure, source_ptr->clone(), configuration, use_vectorscan);
return std::make_shared<RegExpTreeDictionary>(
getDictionaryID(), structure, source_ptr->clone(), configuration, use_vectorscan, flag_case_insensitive, flag_dotall);
}

ColumnUInt8::Ptr hasKeys(const Columns &, const DataTypes &) const override
Expand Down Expand Up @@ -189,6 +192,8 @@ class RegExpTreeDictionary : public IDictionary
using RegexTreeNodePtr = std::shared_ptr<RegexTreeNode>;

bool use_vectorscan;
bool flag_case_insensitive;
bool flag_dotall;

std::vector<std::string> simple_regexps;
std::vector<UInt64> regexp_ids;
Expand Down
Expand Up @@ -27,3 +27,15 @@ GitHub Documentation
['GitHub Documentation','GitHub']
Documentation
['Documentation']
['foo','(?i)foo','(?-i)foo']
['(?i)foo']
['hello.*world','(?i)hello.*world','(?-i)hello.*world']
[]
['(?i)hello.*world']
[]
['foo','(?i)foo','(?-i)foo']
['foo','(?i)foo']
['hello.*world','(?i)hello.*world','(?-i)hello.*world']
['hello.*world','(?i)hello.*world','(?-i)hello.*world']
['hello.*world','(?i)hello.*world']
['hello.*world','(?i)hello.*world']
56 changes: 56 additions & 0 deletions tests/queries/0_stateless/02504_regexp_dictionary_yaml_source.sh
Expand Up @@ -239,10 +239,66 @@ select dictGet('regexp_dict3', 'tag', '/docs');
select dictGetAll('regexp_dict3', 'tag', '/docs');
"

# Test case-insensitive and dot-all match modes
cat > "$yaml" <<EOL
- regexp: 'foo'
pattern: 'foo'
- regexp: '(?i)foo'
pattern: '(?i)foo'
- regexp: '(?-i)foo'
pattern: '(?-i)foo'
- regexp: 'hello.*world'
pattern: 'hello.*world'
- regexp: '(?i)hello.*world'
pattern: '(?i)hello.*world'
- regexp: '(?-i)hello.*world'
pattern: '(?-i)hello.*world'
EOL

$CLICKHOUSE_CLIENT -n --query="
drop dictionary if exists regexp_dict4;
create dictionary regexp_dict4
(
regexp String,
pattern String
)
PRIMARY KEY(regexp)
SOURCE(YAMLRegExpTree(PATH '$yaml'))
LIFETIME(0)
LAYOUT(regexp_tree);

select dictGetAll('regexp_dict4', 'pattern', 'foo');
select dictGetAll('regexp_dict4', 'pattern', 'FOO');
select dictGetAll('regexp_dict4', 'pattern', 'hello world');
select dictGetAll('regexp_dict4', 'pattern', 'hello\nworld');
select dictGetAll('regexp_dict4', 'pattern', 'HELLO WORLD');
select dictGetAll('regexp_dict4', 'pattern', 'HELLO\nWORLD');

drop dictionary if exists regexp_dict4;
create dictionary regexp_dict4
(
regexp String,
pattern String
)
PRIMARY KEY(regexp)
SOURCE(YAMLRegExpTree(PATH '$yaml'))
LIFETIME(0)
LAYOUT(regexp_tree)
SETTINGS(regexp_dict_flag_case_insensitive = true, regexp_dict_flag_dotall = true);

select dictGetAll('regexp_dict4', 'pattern', 'foo');
select dictGetAll('regexp_dict4', 'pattern', 'FOO');
select dictGetAll('regexp_dict4', 'pattern', 'hello world');
select dictGetAll('regexp_dict4', 'pattern', 'hello\nworld');
select dictGetAll('regexp_dict4', 'pattern', 'HELLO WORLD');
select dictGetAll('regexp_dict4', 'pattern', 'HELLO\nWORLD');
"

$CLICKHOUSE_CLIENT -n --query="
drop dictionary regexp_dict1;
drop dictionary regexp_dict2;
drop dictionary regexp_dict3;
drop dictionary regexp_dict4;
"

rm -rf "$USER_FILES_PATH/test_02504"