Skip to content

Commit

Permalink
Merge pull request #61853 from ClickHouse/cherrypick/23.8/cd3edf3f5b4…
Browse files Browse the repository at this point in the history
…e716caf1cea9e34fa8e2984d767f6

Cherry pick #61749 to 23.8: Fix crash in `multiSearchAllPositionsCaseInsensitiveUTF8` for incorrect UTF-8
  • Loading branch information
robot-ch-test-poll committed Mar 25, 2024
2 parents 8e17c02 + cd3edf3 commit 5486534
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 8 deletions.
18 changes: 10 additions & 8 deletions src/Common/Volnitsky.h
Expand Up @@ -191,7 +191,8 @@ namespace VolnitskyTraits
if (length_l != length_r)
return false;

assert(length_l >= 2 && length_r >= 2);
if (length_l < 2 || length_r < 2)
return false; /// Some part of the given ngram contains an invalid UTF-8 sequence.

chars.c0 = seq_l[seq_ngram_offset];
chars.c1 = seq_l[seq_ngram_offset + 1];
Expand Down Expand Up @@ -253,7 +254,9 @@ namespace VolnitskyTraits
if (size_l != size_u)
return false;

assert(size_l >= 1 && size_u >= 1);
if (size_l == 0 || size_u == 0)
return false; /// Some part of the given ngram contains an invalid UTF-8 sequence.

chars.c1 = seq_l[0];
putNGramBase(n, offset);

Expand All @@ -276,7 +279,8 @@ namespace VolnitskyTraits
if (size_l != size_u)
return false;

assert(size_l > seq_ngram_offset && size_u > seq_ngram_offset);
if (size_l <= seq_ngram_offset || size_u <= seq_ngram_offset)
return false; /// Some part of the given ngram contains an invalid UTF-8 sequence.

chars.c0 = seq_l[seq_ngram_offset];
putNGramBase(n, offset);
Expand All @@ -302,10 +306,8 @@ namespace VolnitskyTraits
if (size_first_l != size_first_u || size_second_l != size_second_u)
return false;

assert(size_first_l > seq_ngram_offset);
assert(size_first_u > seq_ngram_offset);
assert(size_second_l > 0);
assert(size_second_u > 0);
if (size_first_l <= seq_ngram_offset || size_first_u <= seq_ngram_offset || size_second_l == 0 || size_second_u == 0)
return false;

auto c0l = first_l_seq[seq_ngram_offset];
auto c0u = first_u_seq[seq_ngram_offset];
Expand Down Expand Up @@ -399,7 +401,7 @@ class VolnitskyBase
if (fallback || fallback_searcher.force_fallback)
return;

hash = std::unique_ptr<VolnitskyTraits::Offset[]>(new VolnitskyTraits::Offset[VolnitskyTraits::hash_size]{});
hash = std::make_unique<VolnitskyTraits::Offset[]>(VolnitskyTraits::hash_size);

auto callback = [this](const VolnitskyTraits::Ngram ngram, const int offset) { return this->putNGramBase(ngram, offset); };
/// ssize_t is used here because an unsigned type can't be used with a condition like `i >= 0`: an unsigned value is always >= 0.
Expand Down
Expand Up @@ -12872,3 +12872,4 @@
1
1
1
1
Expand Up @@ -223,6 +223,8 @@ select [2] = multiSearchAllPositions(materialize('abab'), materialize(['ba']));
select [1] = multiSearchAllPositionsCaseInsensitive(materialize('aBaB'), materialize(['abab']));
select [3] = multiSearchAllPositionsUTF8(materialize('ab€ab'), materialize(['']));
select [3] = multiSearchAllPositionsCaseInsensitiveUTF8(materialize('ab€AB'), materialize(['€ab']));
-- Checks that a broken UTF-8 sequence is handled correctly.
select [0] = multiSearchAllPositionsCaseInsensitiveUTF8(materialize(''), materialize(['a\x90\x90\x90\x90\x90\x90']));

select 1 = multiSearchAny(materialize('abcdefgh'), ['b']);
select 1 = multiSearchAny(materialize('abcdefgh'), ['bc']);
Expand Down

0 comments on commit 5486534

Please sign in to comment.