Merge pull request #51735 from arenadata/ADQM-976

ClickHouse · Jul 17, 2023 · 5de1cfe · 5de1cfe
2 parents db1b172 + 6d7e985
commit 5de1cfe
Show file tree

Hide file tree

Showing 9 changed files with 234 additions and 2 deletions.
diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md
@@ -1255,3 +1255,15 @@ Result:
 │ A240             │
 └──────────────────┘
 ```
+
+## initcap
+
+Convert the first letter of each word to upper case and the rest to lower case. Words are sequences of alphanumeric characters separated by non-alphanumeric characters.
+
+## initcapUTF8
+
+Like [initcap](#initcap), assuming that the string contains valid UTF-8 encoded text. If this assumption is violated, no exception is thrown and the result is undefined.
+
+Does not detect the language, e.g. for Turkish the result might not be exactly correct (i/İ vs. i/I).
+
+If the length of the UTF-8 byte sequence is different for upper and lower case of a code point, the result may be incorrect for this code point.
diff --git a/docs/ru/sql-reference/functions/string-functions.md b/docs/ru/sql-reference/functions/string-functions.md
@@ -1113,3 +1113,14 @@ A text with tags .
 The content within <b>CDATA</b>
 Do Nothing for 2 Minutes 2:00 &nbsp;
 ```
+
+## initcap {#initcap}
+
+Переводит первую букву каждого слова в строке в верхний регистр, а остальные — в нижний. Словами считаются последовательности алфавитно-цифровых символов, разделённые любыми другими символами.
+
+## initcapUTF8 {#initcapUTF8}
+
+Как [initcap](#initcap), предполагая, что строка содержит набор байтов, представляющий текст в кодировке UTF-8.
+Не учитывает язык. То есть, для турецкого языка, результат может быть не совсем верным.
+Если длина UTF-8 последовательности байтов различна для верхнего и нижнего регистра кодовой точки, то для этой кодовой точки результат работы может быть некорректным.
+Если строка содержит набор байтов, не являющийся UTF-8, то поведение не определено.
diff --git a/src/Functions/LowerUpperUTF8Impl.h b/src/Functions/LowerUpperUTF8Impl.h
@@ -133,8 +133,6 @@ struct LowerUpperUTF8Impl
         }
         else
         {
-            static const Poco::UTF8Encoding utf8;
-
             size_t src_sequence_length = UTF8::seqLength(*src);
             /// In case partial buffer was passed (due to SSE optimization)
             /// we cannot convert it with current src_end, but we may have more

diff --git a/src/Functions/initcap.cpp b/src/Functions/initcap.cpp
@@ -0,0 +1,66 @@
+#include <Functions/FunctionFactory.h>
+#include <Functions/FunctionStringToString.h>
+#include <Common/StringUtils/StringUtils.h>
+
+namespace DB
+{
+namespace
+{
+
+struct InitcapImpl
+{
+    static void vector(const ColumnString::Chars & data,
+        const ColumnString::Offsets & offsets,
+        ColumnString::Chars & res_data,
+        ColumnString::Offsets & res_offsets)
+    {
+        if (data.empty())
+            return;
+        res_data.resize(data.size());
+        res_offsets.assign(offsets);
+        array(data.data(), data.data() + data.size(), res_data.data());
+    }
+
+    static void vectorFixed(const ColumnString::Chars & data, size_t /*n*/, ColumnString::Chars & res_data)
+    {
+        res_data.resize(data.size());
+        array(data.data(), data.data() + data.size(), res_data.data());
+    }
+
+private:
+    static void array(const UInt8 * src, const UInt8 * src_end, UInt8 * dst)
+    {
+        bool prev_alphanum = false;
+
+        for (; src < src_end; ++src, ++dst)
+        {
+            char c = *src;
+            bool alphanum = isAlphaNumericASCII(c);
+            if (alphanum && !prev_alphanum)
+                if (isAlphaASCII(c))
+                    *dst = toUpperIfAlphaASCII(c);
+                else
+                    *dst = c;
+            else if (isAlphaASCII(c))
+                *dst = toLowerIfAlphaASCII(c);
+            else
+                *dst = c;
+            prev_alphanum = alphanum;
+        }
+    }
+};
+
+struct NameInitcap
+{
+    static constexpr auto name = "initcap";
+};
+using FunctionInitcap = FunctionStringToString<InitcapImpl, NameInitcap>;
+
+}
+
+REGISTER_FUNCTION(Initcap)
+{
+    factory.registerFunction<FunctionInitcap>({}, FunctionFactory::CaseInsensitive);
+}
+
+}
diff --git a/src/Functions/initcapUTF8.cpp b/src/Functions/initcapUTF8.cpp
@@ -0,0 +1,114 @@
+#include <DataTypes/DataTypeString.h>
+#include <Functions/FunctionStringToString.h>
+#include <Functions/LowerUpperUTF8Impl.h>
+#include <Functions/FunctionFactory.h>
+#include <Poco/Unicode.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int BAD_ARGUMENTS;
+}
+
+namespace
+{
+
+struct InitcapUTF8Impl
+{
+    static void vector(
+        const ColumnString::Chars & data,
+        const ColumnString::Offsets & offsets,
+        ColumnString::Chars & res_data,
+        ColumnString::Offsets & res_offsets)
+    {
+        if (data.empty())
+            return;
+        res_data.resize(data.size());
+        res_offsets.assign(offsets);
+        array(data.data(), data.data() + data.size(), offsets, res_data.data());
+    }
+
+    [[noreturn]] static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
+    {
+        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Function initcapUTF8 cannot work with FixedString argument");
+    }
+
+    static void processCodePoint(const UInt8 *& src, const UInt8 * src_end, UInt8 *& dst, bool& prev_alphanum)
+    {
+        size_t src_sequence_length = UTF8::seqLength(*src);
+        auto src_code_point = UTF8::convertUTF8ToCodePoint(src, src_end - src);
+
+        if (src_code_point)
+        {
+            bool alpha = Poco::Unicode::isAlpha(*src_code_point);
+            bool alphanum = alpha || Poco::Unicode::isDigit(*src_code_point);
+
+            int dst_code_point = *src_code_point;
+            if (alphanum && !prev_alphanum)
+            {
+                if (alpha)
+                    dst_code_point = Poco::Unicode::toUpper(*src_code_point);
+            }
+            else if (alpha)
+            {
+                dst_code_point = Poco::Unicode::toLower(*src_code_point);
+            }
+            prev_alphanum = alphanum;
+            if (dst_code_point > 0)
+            {
+                size_t dst_sequence_length = UTF8::convertCodePointToUTF8(dst_code_point, dst, src_end - src);
+                assert(dst_sequence_length <= 4);
+
+                if (dst_sequence_length == src_sequence_length)
+                {
+                    src += dst_sequence_length;
+                    dst += dst_sequence_length;
+                    return;
+                }
+            }
+        }
+
+        *dst = *src;
+        ++dst;
+        ++src;
+        prev_alphanum = false;
+    }
+
+private:
+
+    static void array(const UInt8 * src, const UInt8 * src_end, const ColumnString::Offsets & offsets, UInt8 * dst)
+    {
+        const auto * offset_it = offsets.begin();
+        const UInt8 * begin = src;
+
+        /// handle remaining symbols, row by row (to avoid influence of bad UTF8 symbols from one row, to another)
+        while (src < src_end)
+        {
+            const UInt8 * row_end = begin + *offset_it;
+            chassert(row_end >= src);
+            bool prev_alphanum = false;
+            while (src < row_end)
+                processCodePoint(src, row_end, dst, prev_alphanum);
+            ++offset_it;
+        }
+    }
+};
+
+struct NameInitcapUTF8
+{
+    static constexpr auto name = "initcapUTF8";
+};
+
+using FunctionInitcapUTF8 = FunctionStringToString<InitcapUTF8Impl, NameInitcapUTF8>;
+
+}
+
+REGISTER_FUNCTION(InitcapUTF8)
+{
+    factory.registerFunction<FunctionInitcapUTF8>();
+}
+
+}
diff --git a/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference b/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference
@@ -364,6 +364,8 @@ in
 inIgnoreSet
 indexHint
 indexOf
+initcap
+initcapUTF8
 initialQueryID
 initializeAggregation
 intDiv

diff --git a/tests/queries/0_stateless/02810_initcap.reference b/tests/queries/0_stateless/02810_initcap.reference
@@ -0,0 +1,13 @@
+
+Hello
+Hello
+Hello World
+Yeah, Well, I`M Gonna Go Build My Own Theme Park
+Crc32ieee Is The Best Function
+42ok
+
+Hello
+Yeah, Well, I`M Gonna Go Build My Own Theme Park
+Привет, Как Дела?
+Ätsch, Bätsch
+We Dont Support Cases When Lowercase And Uppercase Characters Occupy Different Number Of Bytes In Utf-8. As An Example, This Happens For ß And ẞ.
diff --git a/tests/queries/0_stateless/02810_initcap.sql b/tests/queries/0_stateless/02810_initcap.sql
@@ -0,0 +1,14 @@
+select initcap('');
+select initcap('Hello');
+select initcap('hello');
+select initcap('hello world');
+select initcap('yeah, well, i`m gonna go build my own theme park');
+select initcap('CRC32IEEE is the best function');
+select initcap('42oK');
+
+select initcapUTF8('');
+select initcapUTF8('Hello');
+select initcapUTF8('yeah, well, i`m gonna go build my own theme park');
+select initcapUTF8('привет, как дела?');
+select initcapUTF8('ätsch, bätsch');
+select initcapUTF8('We dont support cases when lowercase and uppercase characters occupy different number of bytes in UTF-8. As an example, this happens for ß and ẞ.');
diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt
@@ -1582,6 +1582,8 @@ indexOf
 infi
 initialQueryID
 initializeAggregation
+initcap
+initcapUTF
 injective
 innogames
 inodes