-
Notifications
You must be signed in to change notification settings - Fork 6.5k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #51735 from arenadata/ADQM-976
- Loading branch information
Showing
9 changed files
with
234 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
#include <Functions/FunctionFactory.h> | ||
#include <Functions/FunctionStringToString.h> | ||
#include <Common/StringUtils/StringUtils.h> | ||
|
||
namespace DB | ||
{ | ||
namespace | ||
{ | ||
|
||
struct InitcapImpl | ||
{ | ||
static void vector(const ColumnString::Chars & data, | ||
const ColumnString::Offsets & offsets, | ||
ColumnString::Chars & res_data, | ||
ColumnString::Offsets & res_offsets) | ||
{ | ||
if (data.empty()) | ||
return; | ||
res_data.resize(data.size()); | ||
res_offsets.assign(offsets); | ||
array(data.data(), data.data() + data.size(), res_data.data()); | ||
} | ||
|
||
static void vectorFixed(const ColumnString::Chars & data, size_t /*n*/, ColumnString::Chars & res_data) | ||
{ | ||
res_data.resize(data.size()); | ||
array(data.data(), data.data() + data.size(), res_data.data()); | ||
} | ||
|
||
private: | ||
static void array(const UInt8 * src, const UInt8 * src_end, UInt8 * dst) | ||
{ | ||
bool prev_alphanum = false; | ||
|
||
for (; src < src_end; ++src, ++dst) | ||
{ | ||
char c = *src; | ||
bool alphanum = isAlphaNumericASCII(c); | ||
if (alphanum && !prev_alphanum) | ||
if (isAlphaASCII(c)) | ||
*dst = toUpperIfAlphaASCII(c); | ||
else | ||
*dst = c; | ||
else if (isAlphaASCII(c)) | ||
*dst = toLowerIfAlphaASCII(c); | ||
else | ||
*dst = c; | ||
prev_alphanum = alphanum; | ||
} | ||
} | ||
}; | ||
|
||
/// Name tag for the ASCII variant; supplies the string "initcap" as `name`.
struct NameInitcap
{
    static constexpr const char * name = "initcap";
};
/// The `initcap` function type: FunctionStringToString instantiated with the ASCII implementation and its name tag. | ||
using FunctionInitcap = FunctionStringToString<InitcapImpl, NameInitcap>; | ||
|
||
} | ||
|
||
/// Registers FunctionInitcap in the function factory. | ||
/// The CaseInsensitive flag is passed, so the name is presumably matched regardless of letter case (confirm in FunctionFactory). | ||
REGISTER_FUNCTION(Initcap) | ||
{ | ||
factory.registerFunction<FunctionInitcap>({}, FunctionFactory::CaseInsensitive); | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
#include <DataTypes/DataTypeString.h> | ||
#include <Functions/FunctionStringToString.h> | ||
#include <Functions/LowerUpperUTF8Impl.h> | ||
#include <Functions/FunctionFactory.h> | ||
#include <Poco/Unicode.h> | ||
|
||
|
||
namespace DB | ||
{ | ||
|
||
namespace ErrorCodes | ||
{ | ||
extern const int BAD_ARGUMENTS; | ||
} | ||
|
||
namespace | ||
{ | ||
|
||
struct InitcapUTF8Impl | ||
{ | ||
static void vector( | ||
const ColumnString::Chars & data, | ||
const ColumnString::Offsets & offsets, | ||
ColumnString::Chars & res_data, | ||
ColumnString::Offsets & res_offsets) | ||
{ | ||
if (data.empty()) | ||
return; | ||
res_data.resize(data.size()); | ||
res_offsets.assign(offsets); | ||
array(data.data(), data.data() + data.size(), offsets, res_data.data()); | ||
} | ||
|
||
[[noreturn]] static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &) | ||
{ | ||
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Function initcapUTF8 cannot work with FixedString argument"); | ||
} | ||
|
||
static void processCodePoint(const UInt8 *& src, const UInt8 * src_end, UInt8 *& dst, bool& prev_alphanum) | ||
{ | ||
size_t src_sequence_length = UTF8::seqLength(*src); | ||
auto src_code_point = UTF8::convertUTF8ToCodePoint(src, src_end - src); | ||
|
||
if (src_code_point) | ||
{ | ||
bool alpha = Poco::Unicode::isAlpha(*src_code_point); | ||
bool alphanum = alpha || Poco::Unicode::isDigit(*src_code_point); | ||
|
||
int dst_code_point = *src_code_point; | ||
if (alphanum && !prev_alphanum) | ||
{ | ||
if (alpha) | ||
dst_code_point = Poco::Unicode::toUpper(*src_code_point); | ||
} | ||
else if (alpha) | ||
{ | ||
dst_code_point = Poco::Unicode::toLower(*src_code_point); | ||
} | ||
prev_alphanum = alphanum; | ||
if (dst_code_point > 0) | ||
{ | ||
size_t dst_sequence_length = UTF8::convertCodePointToUTF8(dst_code_point, dst, src_end - src); | ||
assert(dst_sequence_length <= 4); | ||
|
||
if (dst_sequence_length == src_sequence_length) | ||
{ | ||
src += dst_sequence_length; | ||
dst += dst_sequence_length; | ||
return; | ||
} | ||
} | ||
} | ||
|
||
*dst = *src; | ||
++dst; | ||
++src; | ||
prev_alphanum = false; | ||
} | ||
|
||
private: | ||
|
||
static void array(const UInt8 * src, const UInt8 * src_end, const ColumnString::Offsets & offsets, UInt8 * dst) | ||
{ | ||
const auto * offset_it = offsets.begin(); | ||
const UInt8 * begin = src; | ||
|
||
/// handle remaining symbols, row by row (to avoid influence of bad UTF8 symbols from one row, to another) | ||
while (src < src_end) | ||
{ | ||
const UInt8 * row_end = begin + *offset_it; | ||
chassert(row_end >= src); | ||
bool prev_alphanum = false; | ||
while (src < row_end) | ||
processCodePoint(src, row_end, dst, prev_alphanum); | ||
++offset_it; | ||
} | ||
} | ||
}; | ||
|
||
/// Name tag for the UTF-8 variant; supplies the string "initcapUTF8" as `name`.
struct NameInitcapUTF8
{
    static constexpr const char * name = "initcapUTF8";
};
|
||
/// The `initcapUTF8` function type: FunctionStringToString instantiated with the UTF-8 implementation and its name tag. | ||
using FunctionInitcapUTF8 = FunctionStringToString<InitcapUTF8Impl, NameInitcapUTF8>; | ||
|
||
} | ||
|
||
/// Registers FunctionInitcapUTF8 in the function factory with default arguments | ||
/// (unlike the registration of `initcap`, no CaseInsensitive flag is passed here). | ||
REGISTER_FUNCTION(InitcapUTF8) | ||
{ | ||
factory.registerFunction<FunctionInitcapUTF8>(); | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -364,6 +364,8 @@ in | |
inIgnoreSet | ||
indexHint | ||
indexOf | ||
initcap | ||
initcapUTF8 | ||
initialQueryID | ||
initializeAggregation | ||
intDiv | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
|
||
Hello | ||
Hello | ||
Hello World | ||
Yeah, Well, I`M Gonna Go Build My Own Theme Park | ||
Crc32ieee Is The Best Function | ||
42ok | ||
|
||
Hello | ||
Yeah, Well, I`M Gonna Go Build My Own Theme Park | ||
Привет, Как Дела? | ||
Ätsch, Bätsch | ||
We Dont Support Cases When Lowercase And Uppercase Characters Occupy Different Number Of Bytes In Utf-8. As An Example, This Happens For ß And ẞ. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
-- ASCII variant: empty string, capitalized and lowercase single words, several words, | ||
-- punctuation between words, upper-case input mixed with digits, and a word that starts with digits. | ||
select initcap(''); | ||
select initcap('Hello'); | ||
select initcap('hello'); | ||
select initcap('hello world'); | ||
select initcap('yeah, well, i`m gonna go build my own theme park'); | ||
select initcap('CRC32IEEE is the best function'); | ||
select initcap('42oK'); | ||
|
||
-- UTF-8 variant: ASCII inputs, Cyrillic text, German umlauts, and a sentence containing | ||
-- letters whose lower-case and upper-case forms have different UTF-8 lengths. | ||
select initcapUTF8(''); | ||
select initcapUTF8('Hello'); | ||
select initcapUTF8('yeah, well, i`m gonna go build my own theme park'); | ||
select initcapUTF8('привет, как дела?'); | ||
select initcapUTF8('ätsch, bätsch'); | ||
select initcapUTF8('We dont support cases when lowercase and uppercase characters occupy different number of bytes in UTF-8. As an example, this happens for ß and ẞ.'); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1582,6 +1582,8 @@ indexOf | |
infi | ||
initialQueryID | ||
initializeAggregation | ||
initcap | ||
initcapUTF | ||
injective | ||
innogames | ||
inodes | ||
|