Skip to content

Commit

Permalink
Merge pull request #50906 from johanngan/regexptree-flags
Browse files Browse the repository at this point in the history
Add case insensitive and dot-all modes to RegExpTree dictionary
  • Loading branch information
hanfei1991 committed Sep 10, 2023
2 parents d5ffff9 + 60b0df9 commit b68421b
Show file tree
Hide file tree
Showing 6 changed files with 108 additions and 5 deletions.
6 changes: 6 additions & 0 deletions docs/en/sql-reference/dictionaries/index.md
Expand Up @@ -2361,6 +2361,12 @@ Result:
└────────────────────────────────────────┴───────────────────────────────────────────────────────────────────────────────────────┘
```

#### Matching Modes

Pattern matching behavior can be modified with certain dictionary settings:
- `regexp_dict_flag_case_insensitive`: Use case-insensitive matching (defaults to `false`). Can be overridden in individual expressions with `(?i)` and `(?-i)`.
- `regexp_dict_flag_dotall`: Allow '.' to match newline characters (defaults to `false`).

### Use Regular Expression Tree Dictionary in ClickHouse Cloud

The `YAMLRegExpTree` source used above works in ClickHouse Open Source but not in ClickHouse Cloud. To use regexp tree dictionaries in ClickHouse Cloud, first create a regexp tree dictionary from a YAML file locally in ClickHouse Open Source, then dump this dictionary into a CSV file using the `dictionary` table function and the [INTO OUTFILE](../statements/select/into-outfile.md) clause.
Expand Down
2 changes: 2 additions & 0 deletions src/Core/Settings.h
Expand Up @@ -1053,6 +1053,8 @@ class IColumn;
\
M(Bool, format_display_secrets_in_show_and_select, false, "Do not hide secrets in SHOW and SELECT queries.", IMPORTANT) \
M(Bool, regexp_dict_allow_hyperscan, true, "Allow regexp_tree dictionary using Hyperscan library.", 0) \
M(Bool, regexp_dict_flag_case_insensitive, false, "Use case-insensitive matching for a regexp_tree dictionary. Can be overridden in individual expressions with (?i) and (?-i).", 0) \
M(Bool, regexp_dict_flag_dotall, false, "Allow '.' to match newline characters for a regexp_tree dictionary.", 0) \
\
M(Bool, dictionary_use_async_executor, false, "Execute a pipeline for reading from a dictionary with several threads. It's supported only by DIRECT dictionary with CLICKHOUSE source.", 0) \
M(Bool, precise_float_parsing, false, "Prefer more precise (but slower) float parsing algorithm", 0) \
Expand Down
28 changes: 25 additions & 3 deletions src/Dictionaries/RegExpTreeDictionary.cpp
Expand Up @@ -206,6 +206,8 @@ void RegExpTreeDictionary::initRegexNodes(Block & block)

re2_st::RE2::Options regexp_options;
regexp_options.set_log_errors(false);
regexp_options.set_case_sensitive(!flag_case_insensitive);
regexp_options.set_dot_nl(flag_dotall);
RegexTreeNodePtr node = std::make_shared<RegexTreeNode>(id, parent_id, regex, regexp_options);

int num_captures = std::min(node->searcher.NumberOfCapturingGroups() + 1, 10);
Expand Down Expand Up @@ -330,11 +332,20 @@ void RegExpTreeDictionary::loadData()
std::vector<unsigned int> flags;
std::vector<size_t> lengths;

// Notes:
// - Always set HS_FLAG_SINGLEMATCH because we only care about whether a pattern matches at least once
// - HS_FLAG_CASELESS is supported by hs_compile_lit_multi, so we should set it if flag_case_insensitive is set.
// - HS_FLAG_DOTALL is not supported by hs_compile_lit_multi, but the '.' wildcard can't appear in any of the simple regexps
// anyway, so even if flag_dotall is set, we only need to configure the RE2 searcher, and don't need to set any Hyperscan flags.
unsigned int flag_bits = HS_FLAG_SINGLEMATCH;
if (flag_case_insensitive)
flag_bits |= HS_FLAG_CASELESS;

for (const std::string & simple_regexp : simple_regexps)
{
patterns.push_back(simple_regexp.data());
lengths.push_back(simple_regexp.size());
flags.push_back(HS_FLAG_SINGLEMATCH);
flags.push_back(flag_bits);
}

hs_database_t * db = nullptr;
Expand Down Expand Up @@ -380,12 +391,16 @@ RegExpTreeDictionary::RegExpTreeDictionary(
const DictionaryStructure & structure_,
DictionarySourcePtr source_ptr_,
Configuration configuration_,
bool use_vectorscan_)
bool use_vectorscan_,
bool flag_case_insensitive_,
bool flag_dotall_)
: IDictionary(id_),
structure(structure_),
source_ptr(source_ptr_),
configuration(configuration_),
use_vectorscan(use_vectorscan_),
flag_case_insensitive(flag_case_insensitive_),
flag_dotall(flag_dotall_),
logger(&Poco::Logger::get("RegExpTreeDictionary"))
{
if (auto * ch_source = typeid_cast<ClickHouseDictionarySource *>(source_ptr.get()))
Expand Down Expand Up @@ -859,7 +874,14 @@ void registerDictionaryRegExpTree(DictionaryFactory & factory)

auto context = copyContextAndApplySettingsFromDictionaryConfig(global_context, config, config_prefix);

return std::make_unique<RegExpTreeDictionary>(dict_id, dict_struct, std::move(source_ptr), configuration, context->getSettings().regexp_dict_allow_hyperscan);
return std::make_unique<RegExpTreeDictionary>(
dict_id,
dict_struct,
std::move(source_ptr),
configuration,
context->getSettings().regexp_dict_allow_hyperscan,
context->getSettings().regexp_dict_flag_case_insensitive,
context->getSettings().regexp_dict_flag_dotall);
};

factory.registerLayout("regexp_tree", create_layout, true);
Expand Down
9 changes: 7 additions & 2 deletions src/Dictionaries/RegExpTreeDictionary.h
Expand Up @@ -49,7 +49,9 @@ class RegExpTreeDictionary : public IDictionary
const DictionaryStructure & structure_,
DictionarySourcePtr source_ptr_,
Configuration configuration_,
bool use_vectorscan_);
bool use_vectorscan_,
bool flag_case_insensitive_,
bool flag_dotall_);

std::string getTypeName() const override { return name; }

Expand Down Expand Up @@ -85,7 +87,8 @@ class RegExpTreeDictionary : public IDictionary

std::shared_ptr<const IExternalLoadable> clone() const override
{
return std::make_shared<RegExpTreeDictionary>(getDictionaryID(), structure, source_ptr->clone(), configuration, use_vectorscan);
return std::make_shared<RegExpTreeDictionary>(
getDictionaryID(), structure, source_ptr->clone(), configuration, use_vectorscan, flag_case_insensitive, flag_dotall);
}

ColumnUInt8::Ptr hasKeys(const Columns &, const DataTypes &) const override
Expand Down Expand Up @@ -189,6 +192,8 @@ class RegExpTreeDictionary : public IDictionary
using RegexTreeNodePtr = std::shared_ptr<RegexTreeNode>;

bool use_vectorscan;
bool flag_case_insensitive;
bool flag_dotall;

std::vector<std::string> simple_regexps;
std::vector<UInt64> regexp_ids;
Expand Down
Expand Up @@ -27,3 +27,15 @@ GitHub Documentation
['GitHub Documentation','GitHub']
Documentation
['Documentation']
['foo','(?i)foo','(?-i)foo']
['(?i)foo']
['hello.*world','(?i)hello.*world','(?-i)hello.*world']
[]
['(?i)hello.*world']
[]
['foo','(?i)foo','(?-i)foo']
['foo','(?i)foo']
['hello.*world','(?i)hello.*world','(?-i)hello.*world']
['hello.*world','(?i)hello.*world','(?-i)hello.*world']
['hello.*world','(?i)hello.*world']
['hello.*world','(?i)hello.*world']
56 changes: 56 additions & 0 deletions tests/queries/0_stateless/02504_regexp_dictionary_yaml_source.sh
Expand Up @@ -239,10 +239,66 @@ select dictGet('regexp_dict3', 'tag', '/docs');
select dictGetAll('regexp_dict3', 'tag', '/docs');
"

# Test case-insensitive and dot-all match modes
cat > "$yaml" <<EOL
- regexp: 'foo'
pattern: 'foo'
- regexp: '(?i)foo'
pattern: '(?i)foo'
- regexp: '(?-i)foo'
pattern: '(?-i)foo'
- regexp: 'hello.*world'
pattern: 'hello.*world'
- regexp: '(?i)hello.*world'
pattern: '(?i)hello.*world'
- regexp: '(?-i)hello.*world'
pattern: '(?-i)hello.*world'
EOL

$CLICKHOUSE_CLIENT -n --query="
drop dictionary if exists regexp_dict4;
create dictionary regexp_dict4
(
regexp String,
pattern String
)
PRIMARY KEY(regexp)
SOURCE(YAMLRegExpTree(PATH '$yaml'))
LIFETIME(0)
LAYOUT(regexp_tree);
select dictGetAll('regexp_dict4', 'pattern', 'foo');
select dictGetAll('regexp_dict4', 'pattern', 'FOO');
select dictGetAll('regexp_dict4', 'pattern', 'hello world');
select dictGetAll('regexp_dict4', 'pattern', 'hello\nworld');
select dictGetAll('regexp_dict4', 'pattern', 'HELLO WORLD');
select dictGetAll('regexp_dict4', 'pattern', 'HELLO\nWORLD');
drop dictionary if exists regexp_dict4;
create dictionary regexp_dict4
(
regexp String,
pattern String
)
PRIMARY KEY(regexp)
SOURCE(YAMLRegExpTree(PATH '$yaml'))
LIFETIME(0)
LAYOUT(regexp_tree)
SETTINGS(regexp_dict_flag_case_insensitive = true, regexp_dict_flag_dotall = true);
select dictGetAll('regexp_dict4', 'pattern', 'foo');
select dictGetAll('regexp_dict4', 'pattern', 'FOO');
select dictGetAll('regexp_dict4', 'pattern', 'hello world');
select dictGetAll('regexp_dict4', 'pattern', 'hello\nworld');
select dictGetAll('regexp_dict4', 'pattern', 'HELLO WORLD');
select dictGetAll('regexp_dict4', 'pattern', 'HELLO\nWORLD');
"

$CLICKHOUSE_CLIENT -n --query="
drop dictionary regexp_dict1;
drop dictionary regexp_dict2;
drop dictionary regexp_dict3;
drop dictionary regexp_dict4;
"

rm -rf "$USER_FILES_PATH/test_02504"

0 comments on commit b68421b

Please sign in to comment.