Merge branch 'master' into zvonand-issue-49290

ClickHouse · Jul 17, 2023 · 4884022 · 4884022
2 parents d339c22 + 9f45513
commit 4884022
Show file tree

Hide file tree

Showing 24 changed files with 282 additions and 93 deletions.
diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md
@@ -1255,3 +1255,15 @@ Result:
 │ A240             │
 └──────────────────┘
 ```
+
+## initcap
+
+Converts the first letter of each word to upper case and the rest to lower case. Words are sequences of alphanumeric characters separated by non-alphanumeric characters.
+
+## initcapUTF8
+
+Like [initcap](#initcap), assuming that the string contains valid UTF-8 encoded text. If this assumption is violated, no exception is thrown and the result is undefined.
+
+Does not detect the language, so the result may be incorrect where case rules are language-specific, e.g. for Turkish (i/İ vs. i/I).
+
+If the length of the UTF-8 byte sequence is different for upper and lower case of a code point, the result may be incorrect for this code point.
diff --git a/docs/ru/sql-reference/functions/string-functions.md b/docs/ru/sql-reference/functions/string-functions.md
@@ -1113,3 +1113,14 @@ A text with tags .
 The content within <b>CDATA</b>
 Do Nothing for 2 Minutes 2:00 &nbsp;
 ```
+
+## initcap {#initcap}
+
+Переводит первую букву каждого слова в строке в верхний регистр, а остальные — в нижний. Словами считаются последовательности алфавитно-цифровых символов, разделённые любыми другими символами.
+
+## initcapUTF8 {#initcapUTF8}
+
+Как [initcap](#initcap), предполагая, что строка содержит набор байтов, представляющий текст в кодировке UTF-8.
+Не учитывает язык. То есть, для турецкого языка, результат может быть не совсем верным.
+Если длина UTF-8 последовательности байтов различна для верхнего и нижнего регистра кодовой точки, то для этой кодовой точки результат работы может быть некорректным.
+Если строка содержит набор байтов, не являющийся UTF-8, то поведение не определено.
diff --git a/src/AggregateFunctions/fuzzers/aggregate_function_state_deserialization_fuzzer.cpp b/src/AggregateFunctions/fuzzers/aggregate_function_state_deserialization_fuzzer.cpp
@@ -8,7 +8,6 @@
 
 #include <Common/MemoryTracker.h>
 #include <Common/CurrentThread.h>
-#include <Common/Arena.h>
 
 #include <Interpreters/Context.h>
 

diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp
@@ -6223,7 +6223,11 @@ void QueryAnalyzer::resolveTableFunction(QueryTreeNodePtr & table_function_node,
         const auto & insertion_table = scope_context->getInsertionTable();
         if (!insertion_table.empty())
         {
-            const auto & insert_structure = DatabaseCatalog::instance().getTable(insertion_table, scope_context)->getInMemoryMetadataPtr()->getColumns();
+            const auto & insert_structure = DatabaseCatalog::instance()
+                                                .getTable(insertion_table, scope_context)
+                                                ->getInMemoryMetadataPtr()
+                                                ->getColumns()
+                                                .getInsertable();
             DB::ColumnsDescription structure_hint;
 
             bool use_columns_from_insert_query = true;

diff --git a/src/Compression/fuzzers/delta_decompress_fuzzer.cpp b/src/Compression/fuzzers/delta_decompress_fuzzer.cpp
@@ -34,7 +34,7 @@ try
     DB::Memory<> memory;
     memory.resize(output_buffer_size + codec->getAdditionalSizeAtTheEndOfBuffer());
 
-    codec->doDecompressData(reinterpret_cast<const char *>(data), static_cast<UInt32>(size), memory.data(), static_cast<UInt32>(output_buffer_size));
+    codec->doDecompressData(reinterpret_cast<const char *>(data), size, memory.data(), output_buffer_size);
 
     return 0;
 }

diff --git a/src/Compression/fuzzers/double_delta_decompress_fuzzer.cpp b/src/Compression/fuzzers/double_delta_decompress_fuzzer.cpp
@@ -34,7 +34,7 @@ try
     DB::Memory<> memory;
     memory.resize(output_buffer_size + codec->getAdditionalSizeAtTheEndOfBuffer());
 
-    codec->doDecompressData(reinterpret_cast<const char *>(data), static_cast<UInt32>(size), memory.data(), static_cast<UInt32>(output_buffer_size));
+    codec->doDecompressData(reinterpret_cast<const char *>(data), size, memory.data(), output_buffer_size);
 
     return 0;
 }

diff --git a/src/Compression/fuzzers/encrypted_decompress_fuzzer.cpp b/src/Compression/fuzzers/encrypted_decompress_fuzzer.cpp
@@ -292,10 +292,10 @@ try
 
     DB::Memory<> memory;
     memory.resize(input.size() + codec_128->getAdditionalSizeAtTheEndOfBuffer());
-    codec_128->doDecompressData(input.data(), static_cast<UInt32>(input.size()), memory.data(), static_cast<UInt32>(input.size() - 31));
+    codec_128->doDecompressData(input.data(), input.size(), memory.data(), input.size() - 31);
 
     memory.resize(input.size() + codec_128->getAdditionalSizeAtTheEndOfBuffer());
-    codec_256->doDecompressData(input.data(), static_cast<UInt32>(input.size()), memory.data(), static_cast<UInt32>(input.size() - 31));
+    codec_256->doDecompressData(input.data(), input.size(), memory.data(), input.size() - 31);
     return 0;
 }
 catch (...)

diff --git a/src/Compression/fuzzers/lz4_decompress_fuzzer.cpp b/src/Compression/fuzzers/lz4_decompress_fuzzer.cpp
@@ -24,7 +24,7 @@ try
         return 0;
 
     const auto * p = reinterpret_cast<const AuxiliaryRandomData *>(data);
-    auto codec = DB::getCompressionCodecLZ4(static_cast<int>(p->level));
+    auto codec = DB::getCompressionCodecLZ4(p->level);
 
     size_t output_buffer_size = p->decompressed_size % 65536;
     size -= sizeof(AuxiliaryRandomData);
@@ -37,7 +37,7 @@ try
     DB::Memory<> memory;
     memory.resize(output_buffer_size + LZ4::ADDITIONAL_BYTES_AT_END_OF_BUFFER);
 
-    codec->doDecompressData(reinterpret_cast<const char *>(data), static_cast<UInt32>(size), memory.data(), static_cast<UInt32>(output_buffer_size));
+    codec->doDecompressData(reinterpret_cast<const char *>(data), size, memory.data(), output_buffer_size);
 
     return 0;
 }

diff --git a/src/Core/Field.h b/src/Core/Field.h
@@ -28,7 +28,6 @@ namespace ErrorCodes
     extern const int NOT_IMPLEMENTED;
     extern const int LOGICAL_ERROR;
     extern const int ILLEGAL_TYPE_OF_ARGUMENT;
-    extern const int TOO_DEEP_RECURSION;
 }
 
 constexpr Null NEGATIVE_INFINITY{Null::Value::NegativeInfinity};
@@ -42,13 +41,10 @@ using FieldVector = std::vector<Field, AllocatorWithMemoryTracking<Field>>;
 /// construct a Field of Array or a Tuple type. An alternative approach would be
 /// to construct both of these types from FieldVector, and have the caller
 /// specify the desired Field type explicitly.
-/// As the result stack overflow on destruction is possible
-/// and to avoid it we need to count the depth and have a threshold.
 #define DEFINE_FIELD_VECTOR(X) \
 struct X : public FieldVector \
 { \
     using FieldVector::FieldVector; \
-    uint8_t nested_field_depth = 0; \
 }
 
 DEFINE_FIELD_VECTOR(Array);
@@ -65,7 +61,6 @@ using FieldMap = std::map<String, Field, std::less<>, AllocatorWithMemoryTrackin
 struct X : public FieldMap \
 { \
     using FieldMap::FieldMap; \
-    uint8_t nested_field_depth = 0; \
 }
 
 DEFINE_FIELD_MAP(Object);
@@ -296,12 +291,6 @@ decltype(auto) castToNearestFieldType(T && x)
   */
 #define DBMS_MIN_FIELD_SIZE 32
 
-/// Note: uint8_t is used for storing depth value.
-#if defined(SANITIZER) || !defined(NDEBUG)
-    #define DBMS_MAX_NESTED_FIELD_DEPTH 64
-#else
-    #define DBMS_MAX_NESTED_FIELD_DEPTH 255
-#endif
 
 /** Discriminated union of several types.
   * Made for replacement of `boost::variant`
@@ -682,49 +671,6 @@ class Field
 
     Types::Which which;
 
-    /// StorageType and Original are the same for Array, Tuple, Map, Object
-    template <typename StorageType, typename Original>
-    uint8_t calculateAndCheckFieldDepth(Original && x)
-    {
-        uint8_t result = 0;
-
-        if constexpr (std::is_same_v<StorageType, Array>
-            || std::is_same_v<StorageType, Tuple>
-            || std::is_same_v<StorageType, Map>
-            || std::is_same_v<StorageType, Object>)
-        {
-            result = x.nested_field_depth;
-
-            auto get_depth = [](const Field & elem)
-            {
-                switch (elem.which)
-                {
-                    case Types::Array:
-                        return elem.template get<Array>().nested_field_depth;
-                    case Types::Tuple:
-                        return elem.template get<Tuple>().nested_field_depth;
-                    case Types::Map:
-                        return elem.template get<Map>().nested_field_depth;
-                    case Types::Object:
-                        return elem.template get<Object>().nested_field_depth;
-                    default:
-                        return static_cast<uint8_t>(0);
-                }
-            };
-
-            if constexpr (std::is_same_v<StorageType, Object>)
-                for (auto & [_, value] : x)
-                    result = std::max(get_depth(value), result);
-            else
-                for (auto & value : x)
-                    result = std::max(get_depth(value), result);
-        }
-
-        if (result >= DBMS_MAX_NESTED_FIELD_DEPTH)
-            throw Exception(ErrorCodes::TOO_DEEP_RECURSION, "Too deep Field");
-
-        return result;
-    }
 
     /// Assuming there was no allocated state or it was deallocated (see destroy).
     template <typename T>
@@ -738,17 +684,7 @@ class Field
         // we must initialize the entire wide stored type, and not just the
         // nominal type.
         using StorageType = NearestFieldType<UnqualifiedType>;
-
-        /// Incrementing the depth since we create a new Field.
-        auto depth = calculateAndCheckFieldDepth<StorageType>(x);
         new (&storage) StorageType(std::forward<T>(x));
-
-        if constexpr (std::is_same_v<StorageType, Array>
-            || std::is_same_v<StorageType, Tuple>
-            || std::is_same_v<StorageType, Map>
-            || std::is_same_v<StorageType, Object>)
-            reinterpret_cast<StorageType *>(&storage)->nested_field_depth = depth + 1;
-
         which = TypeToEnum<UnqualifiedType>::value;
     }
 
@@ -845,7 +781,7 @@ class Field
     }
 
     template <typename T>
-    ALWAYS_INLINE void destroy()
+    void destroy()
     {
         T * MAY_ALIAS ptr = reinterpret_cast<T*>(&storage);
         ptr->~T();

diff --git a/src/DataTypes/DataTypeFactory.cpp b/src/DataTypes/DataTypeFactory.cpp
@@ -62,7 +62,7 @@ DataTypePtr DataTypeFactory::getImpl(const String & full_name) const
     }
     else
     {
-        ast = parseQuery(parser, full_name.data(), full_name.data() + full_name.size(), "data type", DBMS_DEFAULT_MAX_QUERY_SIZE, data_type_max_parse_depth);
+        ast = parseQuery(parser, full_name.data(), full_name.data() + full_name.size(), "data type", false, data_type_max_parse_depth);
     }
 
     return getImpl<nullptr_on_error>(ast);

diff --git a/src/Functions/DateTimeTransforms.h b/src/Functions/DateTimeTransforms.h
@@ -1521,10 +1521,8 @@ struct Transformer
                 if constexpr (std::is_same_v<Additions, DateTimeAccurateConvertStrategyAdditions>
                     || std::is_same_v<Additions, DateTimeAccurateOrNullConvertStrategyAdditions>)
                 {
-#   pragma clang diagnostic push
-#   pragma clang diagnostic ignored "-Wimplicit-const-int-float-conversion"
                     bool is_valid_input = vec_from[i] >= 0 && vec_from[i] <= 0xFFFFFFFFL;
-#   pragma clang diagnostic pop
+
                     if (!is_valid_input)
                     {
                         if constexpr (std::is_same_v<Additions, DateTimeAccurateOrNullConvertStrategyAdditions>)

diff --git a/src/Functions/LowerUpperUTF8Impl.h b/src/Functions/LowerUpperUTF8Impl.h
@@ -133,8 +133,6 @@ struct LowerUpperUTF8Impl
         }
         else
         {
-            static const Poco::UTF8Encoding utf8;
-
             size_t src_sequence_length = UTF8::seqLength(*src);
             /// In case partial buffer was passed (due to SSE optimization)
             /// we cannot convert it with current src_end, but we may have more

diff --git a/src/Functions/initcap.cpp b/src/Functions/initcap.cpp
@@ -0,0 +1,66 @@
+#include <Functions/FunctionFactory.h>
+#include <Functions/FunctionStringToString.h>
+#include <Common/StringUtils/StringUtils.h>
+
+namespace DB
+{
+namespace
+{
+
+/// ASCII-only initcap: upper-cases the first letter of each word and lower-cases its other letters; a word is a run of ASCII alphanumerics.
+struct InitcapImpl
+{
+    /// String column: transforms the whole `data` buffer in one pass (same size on output) and copies `offsets` to `res_offsets` unchanged.
+    static void vector(const ColumnString::Chars & data,
+        const ColumnString::Offsets & offsets,
+        ColumnString::Chars & res_data,
+        ColumnString::Offsets & res_offsets)
+    {
+        if (data.empty())
+            return;
+        res_data.resize(data.size());
+        res_offsets.assign(offsets);
+        array(data.data(), data.data() + data.size(), res_data.data());  /// NOTE(review): assumes a non-alphanumeric byte (presumably the terminating zero) separates rows - confirm.
+    }
+
+    /// FixedString column: rows are `n` bytes each, stored back to back with no separator, so each row is processed on its own to restart word detection.
+    static void vectorFixed(const ColumnString::Chars & data, size_t n, ColumnString::Chars & res_data)
+    {
+        res_data.resize(data.size());
+        for (size_t pos = 0; n != 0 && pos < data.size(); pos += n)  /// `n != 0` guards against an endless loop; the row length is clamped to the buffer end.
+            array(data.data() + pos, data.data() + pos + (data.size() - pos < n ? data.size() - pos : n), res_data.data() + pos);
+    }
+
+private:
+    /// Writes the transformed bytes of [src, src_end) to dst (same length); word detection starts afresh at src.
+    static void array(const UInt8 * src, const UInt8 * src_end, UInt8 * dst)
+    {
+        bool prev_alphanum = false;
+
+        for (; src < src_end; ++src, ++dst)
+        {
+            const char c = *src;
+            const bool alphanum = isAlphaNumericASCII(c);
+            if (!isAlphaASCII(c))
+                *dst = c;  /// Digits and all other bytes are copied verbatim; the case helpers are only applied to letters.
+            else
+                *dst = (alphanum && !prev_alphanum) ? toUpperIfAlphaASCII(c) : toLowerIfAlphaASCII(c);
+            prev_alphanum = alphanum;
+        }
+    }
+};
+
+struct NameInitcap  /// Name tag passed to FunctionStringToString below; presumably supplies the SQL-visible function name - confirm against that template.
+{
+    static constexpr auto name = "initcap";
+};
+using FunctionInitcap = FunctionStringToString<InitcapImpl, NameInitcap>;  /// The initcap function type: InitcapImpl plugged into the generic string-to-string wrapper.
+
+}
+
+REGISTER_FUNCTION(Initcap)  /// Registers FunctionInitcap with the function factory.
+{
+    factory.registerFunction<FunctionInitcap>({}, FunctionFactory::CaseInsensitive);  /// `{}` is an empty first argument (presumably the function documentation - confirm); CaseInsensitive presumably lets the name match in any letter case.
+}
+
+}