Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Fix crash in multiSearchAllPositionsCaseInsensitiveUTF8 for incorrect UTF-8 #61749

Merged
merged 4 commits into from Mar 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
18 changes: 10 additions & 8 deletions src/Common/Volnitsky.h
Expand Up @@ -191,7 +191,8 @@ namespace VolnitskyTraits
if (length_l != length_r)
return false;

assert(length_l >= 2 && length_r >= 2);
if (length_l < 2 || length_r < 2)
return false; /// Some part of the given ngram contains an invalid UTF-8 sequence.

chars.c0 = seq_l[seq_ngram_offset];
chars.c1 = seq_l[seq_ngram_offset + 1];
Expand Down Expand Up @@ -253,7 +254,9 @@ namespace VolnitskyTraits
if (size_l != size_u)
return false;

assert(size_l >= 1 && size_u >= 1);
if (size_l == 0 || size_u == 0)
return false; /// Some part of the given ngram contains an invalid UTF-8 sequence.

chars.c1 = seq_l[0];
putNGramBase(n, offset);

Expand All @@ -276,7 +279,8 @@ namespace VolnitskyTraits
if (size_l != size_u)
return false;

assert(size_l > seq_ngram_offset && size_u > seq_ngram_offset);
if (size_l <= seq_ngram_offset || size_u <= seq_ngram_offset)
return false; /// Some part of the given ngram contains an invalid UTF-8 sequence.

chars.c0 = seq_l[seq_ngram_offset];
putNGramBase(n, offset);
Expand All @@ -302,10 +306,8 @@ namespace VolnitskyTraits
if (size_first_l != size_first_u || size_second_l != size_second_u)
return false;

assert(size_first_l > seq_ngram_offset);
assert(size_first_u > seq_ngram_offset);
assert(size_second_l > 0);
assert(size_second_u > 0);
if (size_first_l <= seq_ngram_offset || size_first_u <= seq_ngram_offset || size_second_l == 0 || size_second_u == 0)
return false;

auto c0l = first_l_seq[seq_ngram_offset];
auto c0u = first_u_seq[seq_ngram_offset];
Expand Down Expand Up @@ -399,7 +401,7 @@ class VolnitskyBase
if (fallback || fallback_searcher.force_fallback)
return;

hash = std::unique_ptr<VolnitskyTraits::Offset[]>(new VolnitskyTraits::Offset[VolnitskyTraits::hash_size]{});
hash = std::make_unique<VolnitskyTraits::Offset[]>(VolnitskyTraits::hash_size);

auto callback = [this](const VolnitskyTraits::Ngram ngram, const int offset) { return this->putNGramBase(ngram, offset); };
/// ssize_t is used here because unsigned can't be used with condition like `i >= 0`, unsigned always >= 0
Expand Down
Expand Up @@ -12872,3 +12872,4 @@
1
1
1
1
Expand Up @@ -223,6 +223,8 @@ select [2] = multiSearchAllPositions(materialize('abab'), materialize(['ba']));
select [1] = multiSearchAllPositionsCaseInsensitive(materialize('aBaB'), materialize(['abab']));
select [3] = multiSearchAllPositionsUTF8(materialize('ab€ab'), materialize(['€']));
select [3] = multiSearchAllPositionsCaseInsensitiveUTF8(materialize('ab€AB'), materialize(['€ab']));
-- Checks that a broken UTF-8 sequence in the needle is handled correctly (no crash).
select [0] = multiSearchAllPositionsCaseInsensitiveUTF8(materialize(''), materialize(['a\x90\x90\x90\x90\x90\x90']));

select 1 = multiSearchAny(materialize('abcdefgh'), ['b']);
select 1 = multiSearchAny(materialize('abcdefgh'), ['bc']);
Expand Down