Skip to content

Commit

Permalink
Merge pull request #51735 from arenadata/ADQM-976
Browse files Browse the repository at this point in the history
  • Loading branch information
vdimir committed Jul 17, 2023
2 parents db1b172 + 6d7e985 commit 5de1cfe
Show file tree
Hide file tree
Showing 9 changed files with 234 additions and 2 deletions.
12 changes: 12 additions & 0 deletions docs/en/sql-reference/functions/string-functions.md
Expand Up @@ -1255,3 +1255,15 @@ Result:
│ A240 │
└──────────────────┘
```

## initcap

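Converts the first letter of each word to upper case and the rest to lower case. Words are sequences of alphanumeric characters separated by non-alphanumeric characters. Only ASCII letters and digits are recognized; for non-ASCII text use `initcapUTF8`. A word that begins with a digit gets no capital letter (e.g. `42oK` becomes `42ok`).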

## initcapUTF8

Like [initcap](#initcap), assuming that the string contains valid UTF-8 encoded text. If this assumption is violated, no exception is thrown and the result is undefined.

Does not detect the language, e.g. for Turkish the result might not be exactly correct (i/İ vs. i/I).

If the length of the UTF-8 byte sequence is different for upper and lower case of a code point, the result may be incorrect for this code point.
11 changes: 11 additions & 0 deletions docs/ru/sql-reference/functions/string-functions.md
Expand Up @@ -1113,3 +1113,14 @@ A text with tags .
The content within <b>CDATA</b>
Do Nothing for 2 Minutes 2:00 &nbsp;
```

## initcap {#initcap}

Переводит первую букву каждого слова в строке в верхний регистр, а остальные — в нижний. Словами считаются последовательности алфавитно-цифровых символов ASCII, разделённые любыми другими символами.

## initcapUTF8 {#initcapUTF8}

Как [initcap](#initcap), предполагая, что строка содержит набор байтов, представляющий текст в кодировке UTF-8.
Не учитывает язык. То есть, для турецкого языка, результат может быть не совсем верным.
Если длина UTF-8 последовательности байтов различна для верхнего и нижнего регистра кодовой точки, то для этой кодовой точки результат работы может быть некорректным.
Если строка содержит набор байтов, не являющийся UTF-8, то поведение не определено.
2 changes: 0 additions & 2 deletions src/Functions/LowerUpperUTF8Impl.h
Expand Up @@ -133,8 +133,6 @@ struct LowerUpperUTF8Impl
}
else
{
static const Poco::UTF8Encoding utf8;

size_t src_sequence_length = UTF8::seqLength(*src);
/// In case partial buffer was passed (due to SSE optimization)
/// we cannot convert it with current src_end, but we may have more
Expand Down
66 changes: 66 additions & 0 deletions src/Functions/initcap.cpp
@@ -0,0 +1,66 @@
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionStringToString.h>
#include <Common/StringUtils/StringUtils.h>

namespace DB
{
namespace
{

/// Implementation of `initcap` for ASCII text.
///
/// Within every value, the first ASCII letter of each word is converted to upper case and all
/// other ASCII letters to lower case. A word is a maximal run of ASCII alphanumeric characters;
/// every other byte is copied unchanged and acts as a word separator.
/// The result always has exactly the same size as the input.
struct InitcapImpl
{
    /// Transforms a String column.
    ///
    /// @param data        concatenated bytes of all rows
    /// @param offsets     end position of every row inside `data`
    /// @param res_data    receives the transformed bytes (resized to `data.size()`)
    /// @param res_offsets receives a copy of `offsets`, since row sizes do not change
    static void vector(const ColumnString::Chars & data,
        const ColumnString::Offsets & offsets,
        ColumnString::Chars & res_data,
        ColumnString::Offsets & res_offsets)
    {
        if (data.empty())
            return;

        res_data.resize(data.size());
        res_offsets.assign(offsets);

        const UInt8 * const src_begin = data.data();
        UInt8 * const dst_begin = res_data.data();

        /// Process row by row so that the "inside a word" state never leaks from the end of one
        /// row into the beginning of the next one, regardless of how rows are delimited.
        size_t row_begin = 0;
        for (const auto row_end : offsets)
        {
            array(src_begin + row_begin, src_begin + row_end, dst_begin + row_begin);
            row_begin = row_end;
        }

        /// Defensive: transform whatever is left after the last offset, so that every byte of
        /// `res_data` is initialized exactly as before.
        if (row_begin < data.size())
            array(src_begin + row_begin, src_begin + data.size(), dst_begin + row_begin);
    }

    /// Transforms a FixedString column.
    ///
    /// @param data     bytes of all values; expected to hold consecutive values of `n` bytes each
    /// @param n        size of a single value in bytes
    /// @param res_data receives the transformed bytes (resized to `data.size()`)
    static void vectorFixed(const ColumnString::Chars & data, size_t n, ColumnString::Chars & res_data)
    {
        const size_t size = data.size();
        res_data.resize(size);

        const UInt8 * const src_begin = data.data();
        UInt8 * const dst_begin = res_data.data();

        if (n == 0)
        {
            /// Row size is unknown: fall back to treating the buffer as a single value.
            array(src_begin, src_begin + size, dst_begin);
            return;
        }

        /// Values are not separated from each other, so each one must be processed on its own.
        /// Otherwise a value that ends with a letter or digit would prevent capitalization of
        /// the first letter of the next value.
        for (size_t pos = 0; pos < size; pos += n)
        {
            const size_t remaining = size - pos;
            const size_t length = n < remaining ? n : remaining;
            array(src_begin + pos, src_begin + pos + length, dst_begin + pos);
        }
    }

private:
    /// Transforms a single value located in [src, src_end) and writes the result to `dst`.
    static void array(const UInt8 * src, const UInt8 * src_end, UInt8 * dst)
    {
        /// True when the previous byte was an ASCII letter or digit, i.e. we are inside a word.
        bool prev_alphanum = false;

        for (; src < src_end; ++src, ++dst)
        {
            const char c = *src;

            /// Letters are upper-cased at the start of a word and lower-cased inside it.
            /// Digits and all other bytes are copied as is.
            if (isAlphaASCII(c))
                *dst = prev_alphanum ? toLowerIfAlphaASCII(c) : toUpperIfAlphaASCII(c);
            else
                *dst = c;

            prev_alphanum = isAlphaNumericASCII(c);
        }
    }
};

/// Name tag for the `initcap` function: carries the SQL-visible function name.
struct NameInitcap
{
    static constexpr const char * name = "initcap";
};
/// The `initcap` function type: the generic string-to-string function template instantiated
/// with InitcapImpl (the transformation) and NameInitcap (the function name).
using FunctionInitcap = FunctionStringToString<InitcapImpl, NameInitcap>;

}

/// Registers `initcap` in the function factory. FunctionFactory::CaseInsensitive is passed,
/// so the name is registered as case-insensitive.
/// NOTE(review): the first argument `{}` is presumably an empty documentation object — confirm.
REGISTER_FUNCTION(Initcap)
{
factory.registerFunction<FunctionInitcap>({}, FunctionFactory::CaseInsensitive);
}

}
114 changes: 114 additions & 0 deletions src/Functions/initcapUTF8.cpp
@@ -0,0 +1,114 @@
#include <DataTypes/DataTypeString.h>
#include <Functions/FunctionStringToString.h>
#include <Functions/LowerUpperUTF8Impl.h>
#include <Functions/FunctionFactory.h>
#include <Poco/Unicode.h>


namespace DB
{

namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
}

namespace
{

/// Implementation of `initcapUTF8`.
///
/// Within every row, the first letter of each word is converted to upper case and all other
/// letters to lower case, using Poco::Unicode for classification and case mapping. A word is a
/// maximal run of letters and digits. The result always has exactly the same size as the input:
/// a code point whose case-converted form has a different UTF-8 length is left unchanged.
struct InitcapUTF8Impl
{
    /// Transforms a String column.
    ///
    /// @param data        concatenated bytes of all rows
    /// @param offsets     end position of every row inside `data`
    /// @param res_data    receives the transformed bytes (resized to `data.size()`)
    /// @param res_offsets receives a copy of `offsets`, since row sizes do not change
    static void vector(
        const ColumnString::Chars & data,
        const ColumnString::Offsets & offsets,
        ColumnString::Chars & res_data,
        ColumnString::Offsets & res_offsets)
    {
        if (data.empty())
            return;
        res_data.resize(data.size());
        res_offsets.assign(offsets);
        array(data.data(), data.data() + data.size(), offsets, res_data.data());
    }

    /// FixedString arguments are not supported: always throws BAD_ARGUMENTS.
    [[noreturn]] static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
    {
        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Function initcapUTF8 cannot work with FixedString argument");
    }

    /// Transforms one code point starting at `src` and advances both `src` and `dst` past it.
    ///
    /// @param src           current read position; must be less than `src_end`
    /// @param src_end       end of the current row
    /// @param dst           current write position; the output has the same layout as the input
    /// @param prev_alphanum in/out: whether the previously processed code point was a letter or digit
    static void processCodePoint(const UInt8 *& src, const UInt8 * src_end, UInt8 *& dst, bool & prev_alphanum)
    {
        const size_t src_sequence_length = UTF8::seqLength(*src);
        const auto src_code_point = UTF8::convertUTF8ToCodePoint(src, src_end - src);

        if (src_code_point)
        {
            const bool alpha = Poco::Unicode::isAlpha(*src_code_point);
            const bool alphanum = alpha || Poco::Unicode::isDigit(*src_code_point);

            /// Letters are upper-cased at the start of a word and lower-cased inside it.
            /// Digits and non-alphanumeric code points are kept as is.
            int dst_code_point = *src_code_point;
            if (alpha)
            {
                dst_code_point = prev_alphanum
                    ? Poco::Unicode::toLower(*src_code_point)
                    : Poco::Unicode::toUpper(*src_code_point);
            }

            prev_alphanum = alphanum;

            if (dst_code_point > 0)
            {
                const size_t dst_sequence_length = UTF8::convertCodePointToUTF8(dst_code_point, dst, src_end - src);
                assert(dst_sequence_length <= 4);

                if (dst_sequence_length == src_sequence_length)
                {
                    src += dst_sequence_length;
                    dst += dst_sequence_length;
                    return;
                }
            }

            /// The converted code point cannot be stored in place (its UTF-8 length differs, or
            /// it could not be encoded). Keep the original sequence unchanged, but still treat
            /// it as one code point: the word state set above is preserved, so the following
            /// letter is not mistaken for the start of a new word.
            const size_t available = static_cast<size_t>(src_end - src);
            if (src_sequence_length > 0 && src_sequence_length <= available)
            {
                for (size_t i = 0; i < src_sequence_length; ++i)
                    *dst++ = *src++;
                return;
            }
        }

        /// Invalid or truncated sequence: copy a single byte and treat it as a word separator.
        *dst = *src;
        ++dst;
        ++src;
        prev_alphanum = false;
    }

private:

    /// Transforms all rows located in [src, src_end); `offsets` gives the end of every row
    /// relative to the initial value of `src`.
    static void array(const UInt8 * src, const UInt8 * src_end, const ColumnString::Offsets & offsets, UInt8 * dst)
    {
        const auto * offset_it = offsets.begin();
        const UInt8 * begin = src;

        /// Process row by row: the word state is reset for every row, and a broken UTF-8
        /// sequence at the end of one row cannot consume bytes of the next one.
        while (src < src_end)
        {
            const UInt8 * row_end = begin + *offset_it;
            chassert(row_end >= src);
            bool prev_alphanum = false;
            while (src < row_end)
                processCodePoint(src, row_end, dst, prev_alphanum);
            ++offset_it;
        }
    }
};

/// Name tag for the `initcapUTF8` function: carries the SQL-visible function name.
struct NameInitcapUTF8
{
    static constexpr const char * name = "initcapUTF8";
};

/// The `initcapUTF8` function type: the generic string-to-string function template instantiated
/// with InitcapUTF8Impl (the transformation) and NameInitcapUTF8 (the function name).
using FunctionInitcapUTF8 = FunctionStringToString<InitcapUTF8Impl, NameInitcapUTF8>;

}

/// Registers `initcapUTF8` in the function factory. No extra arguments are passed, so the
/// factory's default registration settings apply.
/// NOTE(review): the default is presumably a case-sensitive name — confirm against FunctionFactory.
REGISTER_FUNCTION(InitcapUTF8)
{
factory.registerFunction<FunctionInitcapUTF8>();
}

}
Expand Up @@ -364,6 +364,8 @@ in
inIgnoreSet
indexHint
indexOf
initcap
initcapUTF8
initialQueryID
initializeAggregation
intDiv
Expand Down
13 changes: 13 additions & 0 deletions tests/queries/0_stateless/02810_initcap.reference
@@ -0,0 +1,13 @@

Hello
Hello
Hello World
Yeah, Well, I`M Gonna Go Build My Own Theme Park
Crc32ieee Is The Best Function
42ok

Hello
Yeah, Well, I`M Gonna Go Build My Own Theme Park
Привет, Как Дела?
Ätsch, Bätsch
We Dont Support Cases When Lowercase And Uppercase Characters Occupy Different Number Of Bytes In Utf-8. As An Example, This Happens For ß And ẞ.
14 changes: 14 additions & 0 deletions tests/queries/0_stateless/02810_initcap.sql
@@ -0,0 +1,14 @@
-- initcap: ASCII-only variant.
-- Empty input stays empty.
select initcap('');
-- Already capitalized and all-lowercase single words.
select initcap('Hello');
select initcap('hello');
-- Several words separated by spaces and punctuation (the backtick also splits a word).
select initcap('hello world');
select initcap('yeah, well, i`m gonna go build my own theme park');
-- Upper-case input is lower-cased after the first letter; digits are part of a word.
select initcap('CRC32IEEE is the best function');
-- A word starting with a digit gets no capital letter.
select initcap('42oK');

-- initcapUTF8: Unicode-aware variant.
select initcapUTF8('');
select initcapUTF8('Hello');
select initcapUTF8('yeah, well, i`m gonna go build my own theme park');
-- Non-ASCII letters (Cyrillic, German umlauts).
select initcapUTF8('привет, как дела?');
select initcapUTF8('ätsch, bätsch');
-- Code points whose upper and lower case forms have different UTF-8 lengths are not converted.
select initcapUTF8('We dont support cases when lowercase and uppercase characters occupy different number of bytes in UTF-8. As an example, this happens for ß and ẞ.');
2 changes: 2 additions & 0 deletions utils/check-style/aspell-ignore/en/aspell-dict.txt
Expand Up @@ -1582,6 +1582,8 @@ indexOf
infi
initialQueryID
initializeAggregation
initcap
initcapUTF
injective
innogames
inodes
Expand Down

0 comments on commit 5de1cfe

Please sign in to comment.