Skip to content

Commit

Permalink
Merge branch 'master' into zvonand-issue-49290
Browse files Browse the repository at this point in the history
  • Loading branch information
alexey-milovidov committed Jul 17, 2023
2 parents d339c22 + 9f45513 commit 4884022
Show file tree
Hide file tree
Showing 24 changed files with 282 additions and 93 deletions.
12 changes: 12 additions & 0 deletions docs/en/sql-reference/functions/string-functions.md
Expand Up @@ -1255,3 +1255,15 @@ Result:
│ A240 │
└──────────────────┘
```

## initcap

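Converts the first letter of each word to upper case and the remaining letters to lower case. Words are sequences of alphanumeric characters separated by non-alphanumeric characters. Only ASCII letters are recognized and converted; all other bytes are copied unchanged.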

## initcapUTF8

Like [initcap](#initcap), assuming that the string contains valid UTF-8 encoded text. If this assumption is violated, no exception is thrown and the result is undefined.

Does not detect the language, e.g. for Turkish the result might not be exactly correct (i/İ vs. i/I).

If the length of the UTF-8 byte sequence is different for upper and lower case of a code point, the result may be incorrect for this code point.
11 changes: 11 additions & 0 deletions docs/ru/sql-reference/functions/string-functions.md
Expand Up @@ -1113,3 +1113,14 @@ A text with tags .
The content within <b>CDATA</b>
Do Nothing for 2 Minutes 2:00 &nbsp;
```

## initcap {#initcap}

Переводит первую букву каждого слова в строке в верхний регистр, а остальные — в нижний. Словами считаются последовательности алфавитно-цифровых символов, разделённые любыми другими символами.

## initcapUTF8 {#initcapUTF8}

Как [initcap](#initcap), предполагая, что строка содержит набор байтов, представляющий текст в кодировке UTF-8.
Не учитывает язык. То есть для турецкого языка результат может быть не совсем верным.
Если длина UTF-8 последовательности байтов различна для верхнего и нижнего регистра кодовой точки, то для этой кодовой точки результат работы может быть некорректным.
Если строка содержит набор байтов, не являющийся UTF-8, то поведение не определено.
Expand Up @@ -8,7 +8,6 @@

#include <Common/MemoryTracker.h>
#include <Common/CurrentThread.h>
#include <Common/Arena.h>

#include <Interpreters/Context.h>

Expand Down
6 changes: 5 additions & 1 deletion src/Analyzer/Passes/QueryAnalysisPass.cpp
Expand Up @@ -6223,7 +6223,11 @@ void QueryAnalyzer::resolveTableFunction(QueryTreeNodePtr & table_function_node,
const auto & insertion_table = scope_context->getInsertionTable();
if (!insertion_table.empty())
{
const auto & insert_structure = DatabaseCatalog::instance().getTable(insertion_table, scope_context)->getInMemoryMetadataPtr()->getColumns();
const auto & insert_structure = DatabaseCatalog::instance()
.getTable(insertion_table, scope_context)
->getInMemoryMetadataPtr()
->getColumns()
.getInsertable();
DB::ColumnsDescription structure_hint;

bool use_columns_from_insert_query = true;
Expand Down
2 changes: 1 addition & 1 deletion src/Compression/fuzzers/delta_decompress_fuzzer.cpp
Expand Up @@ -34,7 +34,7 @@ try
DB::Memory<> memory;
memory.resize(output_buffer_size + codec->getAdditionalSizeAtTheEndOfBuffer());

codec->doDecompressData(reinterpret_cast<const char *>(data), static_cast<UInt32>(size), memory.data(), static_cast<UInt32>(output_buffer_size));
codec->doDecompressData(reinterpret_cast<const char *>(data), size, memory.data(), output_buffer_size);

return 0;
}
Expand Down
2 changes: 1 addition & 1 deletion src/Compression/fuzzers/double_delta_decompress_fuzzer.cpp
Expand Up @@ -34,7 +34,7 @@ try
DB::Memory<> memory;
memory.resize(output_buffer_size + codec->getAdditionalSizeAtTheEndOfBuffer());

codec->doDecompressData(reinterpret_cast<const char *>(data), static_cast<UInt32>(size), memory.data(), static_cast<UInt32>(output_buffer_size));
codec->doDecompressData(reinterpret_cast<const char *>(data), size, memory.data(), output_buffer_size);

return 0;
}
Expand Down
4 changes: 2 additions & 2 deletions src/Compression/fuzzers/encrypted_decompress_fuzzer.cpp
Expand Up @@ -292,10 +292,10 @@ try

DB::Memory<> memory;
memory.resize(input.size() + codec_128->getAdditionalSizeAtTheEndOfBuffer());
codec_128->doDecompressData(input.data(), static_cast<UInt32>(input.size()), memory.data(), static_cast<UInt32>(input.size() - 31));
codec_128->doDecompressData(input.data(), input.size(), memory.data(), input.size() - 31);

memory.resize(input.size() + codec_128->getAdditionalSizeAtTheEndOfBuffer());
codec_256->doDecompressData(input.data(), static_cast<UInt32>(input.size()), memory.data(), static_cast<UInt32>(input.size() - 31));
codec_256->doDecompressData(input.data(), input.size(), memory.data(), input.size() - 31);
return 0;
}
catch (...)
Expand Down
4 changes: 2 additions & 2 deletions src/Compression/fuzzers/lz4_decompress_fuzzer.cpp
Expand Up @@ -24,7 +24,7 @@ try
return 0;

const auto * p = reinterpret_cast<const AuxiliaryRandomData *>(data);
auto codec = DB::getCompressionCodecLZ4(static_cast<int>(p->level));
auto codec = DB::getCompressionCodecLZ4(p->level);

size_t output_buffer_size = p->decompressed_size % 65536;
size -= sizeof(AuxiliaryRandomData);
Expand All @@ -37,7 +37,7 @@ try
DB::Memory<> memory;
memory.resize(output_buffer_size + LZ4::ADDITIONAL_BYTES_AT_END_OF_BUFFER);

codec->doDecompressData(reinterpret_cast<const char *>(data), static_cast<UInt32>(size), memory.data(), static_cast<UInt32>(output_buffer_size));
codec->doDecompressData(reinterpret_cast<const char *>(data), size, memory.data(), output_buffer_size);

return 0;
}
Expand Down
66 changes: 1 addition & 65 deletions src/Core/Field.h
Expand Up @@ -28,7 +28,6 @@ namespace ErrorCodes
extern const int NOT_IMPLEMENTED;
extern const int LOGICAL_ERROR;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int TOO_DEEP_RECURSION;
}

constexpr Null NEGATIVE_INFINITY{Null::Value::NegativeInfinity};
Expand All @@ -42,13 +41,10 @@ using FieldVector = std::vector<Field, AllocatorWithMemoryTracking<Field>>;
/// construct a Field of Array or a Tuple type. An alternative approach would be
/// to construct both of these types from FieldVector, and have the caller
/// specify the desired Field type explicitly.
/// As the result stack overflow on destruction is possible
/// and to avoid it we need to count the depth and have a threshold.
#define DEFINE_FIELD_VECTOR(X) \
struct X : public FieldVector \
{ \
using FieldVector::FieldVector; \
uint8_t nested_field_depth = 0; \
}

DEFINE_FIELD_VECTOR(Array);
Expand All @@ -65,7 +61,6 @@ using FieldMap = std::map<String, Field, std::less<>, AllocatorWithMemoryTrackin
struct X : public FieldMap \
{ \
using FieldMap::FieldMap; \
uint8_t nested_field_depth = 0; \
}

DEFINE_FIELD_MAP(Object);
Expand Down Expand Up @@ -296,12 +291,6 @@ decltype(auto) castToNearestFieldType(T && x)
*/
#define DBMS_MIN_FIELD_SIZE 32

/// Note: uint8_t is used for storing depth value.
#if defined(SANITIZER) || !defined(NDEBUG)
#define DBMS_MAX_NESTED_FIELD_DEPTH 64
#else
#define DBMS_MAX_NESTED_FIELD_DEPTH 255
#endif

/** Discriminated union of several types.
* Made for replacement of `boost::variant`
Expand Down Expand Up @@ -682,49 +671,6 @@ class Field

Types::Which which;

/// Computes the nesting depth of `x` and throws if it has reached the allowed maximum.
/// StorageType and Original are the same for Array, Tuple, Map, Object
/// For those four storage types the result is the maximum of `x.nested_field_depth` and the
/// `nested_field_depth` of every direct element of `x`; for every other storage type it is 0.
/// Throws Exception(ErrorCodes::TOO_DEEP_RECURSION) when the result is
/// greater than or equal to DBMS_MAX_NESTED_FIELD_DEPTH.
template <typename StorageType, typename Original>
uint8_t calculateAndCheckFieldDepth(Original && x)
{
uint8_t result = 0;

if constexpr (std::is_same_v<StorageType, Array>
|| std::is_same_v<StorageType, Tuple>
|| std::is_same_v<StorageType, Map>
|| std::is_same_v<StorageType, Object>)
{
/// Start from the depth already recorded on the container itself.
result = x.nested_field_depth;

/// Depth recorded on an element that is itself a container; any other element kind counts as 0.
auto get_depth = [](const Field & elem)
{
switch (elem.which)
{
case Types::Array:
return elem.template get<Array>().nested_field_depth;
case Types::Tuple:
return elem.template get<Tuple>().nested_field_depth;
case Types::Map:
return elem.template get<Map>().nested_field_depth;
case Types::Object:
return elem.template get<Object>().nested_field_depth;
default:
return static_cast<uint8_t>(0);
}
};

/// Object is iterated as key/value pairs (only the value is inspected);
/// the other containers are iterated element by element.
if constexpr (std::is_same_v<StorageType, Object>)
for (auto & [_, value] : x)
result = std::max(get_depth(value), result);
else
for (auto & value : x)
result = std::max(get_depth(value), result);
}

/// The check is applied to every storage type, although only containers can yield a non-zero result.
if (result >= DBMS_MAX_NESTED_FIELD_DEPTH)
throw Exception(ErrorCodes::TOO_DEEP_RECURSION, "Too deep Field");

return result;
}

/// Assuming there was no allocated state or it was deallocated (see destroy).
template <typename T>
Expand All @@ -738,17 +684,7 @@ class Field
// we must initialize the entire wide stored type, and not just the
// nominal type.
using StorageType = NearestFieldType<UnqualifiedType>;

/// Incrementing the depth since we create a new Field.
auto depth = calculateAndCheckFieldDepth<StorageType>(x);
new (&storage) StorageType(std::forward<T>(x));

if constexpr (std::is_same_v<StorageType, Array>
|| std::is_same_v<StorageType, Tuple>
|| std::is_same_v<StorageType, Map>
|| std::is_same_v<StorageType, Object>)
reinterpret_cast<StorageType *>(&storage)->nested_field_depth = depth + 1;

which = TypeToEnum<UnqualifiedType>::value;
}

Expand Down Expand Up @@ -845,7 +781,7 @@ class Field
}

template <typename T>
ALWAYS_INLINE void destroy()
void destroy()
{
T * MAY_ALIAS ptr = reinterpret_cast<T*>(&storage);
ptr->~T();
Expand Down
2 changes: 1 addition & 1 deletion src/DataTypes/DataTypeFactory.cpp
Expand Up @@ -62,7 +62,7 @@ DataTypePtr DataTypeFactory::getImpl(const String & full_name) const
}
else
{
ast = parseQuery(parser, full_name.data(), full_name.data() + full_name.size(), "data type", DBMS_DEFAULT_MAX_QUERY_SIZE, data_type_max_parse_depth);
ast = parseQuery(parser, full_name.data(), full_name.data() + full_name.size(), "data type", false, data_type_max_parse_depth);
}

return getImpl<nullptr_on_error>(ast);
Expand Down
4 changes: 1 addition & 3 deletions src/Functions/DateTimeTransforms.h
Expand Up @@ -1521,10 +1521,8 @@ struct Transformer
if constexpr (std::is_same_v<Additions, DateTimeAccurateConvertStrategyAdditions>
|| std::is_same_v<Additions, DateTimeAccurateOrNullConvertStrategyAdditions>)
{
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wimplicit-const-int-float-conversion"
bool is_valid_input = vec_from[i] >= 0 && vec_from[i] <= 0xFFFFFFFFL;
# pragma clang diagnostic pop

if (!is_valid_input)
{
if constexpr (std::is_same_v<Additions, DateTimeAccurateOrNullConvertStrategyAdditions>)
Expand Down
2 changes: 0 additions & 2 deletions src/Functions/LowerUpperUTF8Impl.h
Expand Up @@ -133,8 +133,6 @@ struct LowerUpperUTF8Impl
}
else
{
static const Poco::UTF8Encoding utf8;

size_t src_sequence_length = UTF8::seqLength(*src);
/// In case partial buffer was passed (due to SSE optimization)
/// we cannot convert it with current src_end, but we may have more
Expand Down
66 changes: 66 additions & 0 deletions src/Functions/initcap.cpp
@@ -0,0 +1,66 @@
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionStringToString.h>
#include <Common/StringUtils/StringUtils.h>

namespace DB
{
namespace
{

/// Byte-wise implementation of `initcap` for ASCII text.
/// The first letter of every word is upper-cased and the remaining letters of the word are
/// lower-cased; a word is a maximal run of ASCII alphanumeric bytes. All other bytes are copied as is.
struct InitcapImpl
{
    /// Transforms a variable-length string column.
    /// `data` / `offsets` describe the input; `res_data` receives the converted bytes and
    /// `res_offsets` a copy of `offsets` (the transformation never changes lengths).
    /// When `data` is empty nothing is written to either output.
    static void vector(const ColumnString::Chars & data,
        const ColumnString::Offsets & offsets,
        ColumnString::Chars & res_data,
        ColumnString::Offsets & res_offsets)
    {
        if (data.empty())
            return;
        res_data.resize(data.size());
        res_offsets.assign(offsets);
        /// NOTE(review): the whole buffer is converted in a single pass, which relies on
        /// consecutive rows being separated by a non-alphanumeric byte (presumably the
        /// terminating zero of each string) so that word state does not leak between rows — confirm.
        array(data.data(), data.data() + data.size(), res_data.data());
    }

    /// Transforms a fixed-length string column where every row occupies `n` bytes of `data`.
    /// Rows are stored back to back without any separator, so each row is converted
    /// independently; otherwise a row ending in an alphanumeric byte would prevent the
    /// first letter of the following row from being capitalized.
    static void vectorFixed(const ColumnString::Chars & data, size_t n, ColumnString::Chars & res_data)
    {
        res_data.resize(data.size());

        const size_t total = data.size();
        /// Defensive: with n == 0 fall back to treating the buffer as a single row.
        const size_t row_size = n ? n : total;

        for (size_t pos = 0; pos < total; pos += row_size)
        {
            /// The last chunk is clamped in case the buffer size is not a multiple of the row size.
            const size_t len = (total - pos < row_size) ? (total - pos) : row_size;
            array(data.data() + pos, data.data() + pos + len, res_data.data() + pos);
        }
    }

private:
    /// Converts the byte range [src, src_end) into `dst`, which must have room for the same
    /// number of bytes. Word tracking starts fresh at `src`: the first byte is treated as
    /// the beginning of a word if it is alphanumeric.
    static void array(const UInt8 * src, const UInt8 * src_end, UInt8 * dst)
    {
        bool prev_alphanum = false;

        for (; src < src_end; ++src, ++dst)
        {
            char c = *src;
            bool alphanum = isAlphaNumericASCII(c);

            if (!isAlphaASCII(c))
            {
                /// Digits and non-alphanumeric bytes are never modified.
                *dst = c;
            }
            else if (!prev_alphanum)
            {
                /// A letter that starts a word.
                *dst = toUpperIfAlphaASCII(c);
            }
            else
            {
                /// A letter inside a word (also after a leading digit, e.g. "1st").
                *dst = toLowerIfAlphaASCII(c);
            }

            prev_alphanum = alphanum;
        }
    }
};

/// Carries the SQL-visible name of the function as a compile-time constant.
struct NameInitcap
{
    static constexpr const char * name = "initcap";
};
/// The `initcap` function type: the FunctionStringToString template instantiated with
/// InitcapImpl and NameInitcap.
using FunctionInitcap = FunctionStringToString<InitcapImpl, NameInitcap>;

}

/// Registers FunctionInitcap with the function factory.
REGISTER_FUNCTION(Initcap)
{
/// The first argument is passed as an empty `{}`; the CaseInsensitive flag presumably lets
/// the function name be matched regardless of letter case — confirm against FunctionFactory.
factory.registerFunction<FunctionInitcap>({}, FunctionFactory::CaseInsensitive);
}

}

0 comments on commit 4884022

Please sign in to comment.