Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add new function startsWithUTF8 and endsWithUTF8 #52555

Merged
merged 10 commits into from Aug 10, 2023
28 changes: 28 additions & 0 deletions docs/en/sql-reference/functions/string-functions.md
Expand Up @@ -729,6 +729,22 @@ Returns whether string `str` ends with `suffix`.
endsWith(str, suffix)
```

## endsWithUTF8
taiyang-li marked this conversation as resolved.
Show resolved Hide resolved

Returns whether string `str` ends with `suffix`. The difference between `endsWithUTF8` and `endsWith` is that `endsWithUTF8` matches `str` and `suffix` by UTF-8 characters.

**Syntax**

```sql
endsWithUTF8(str, suffix)
```

**Example**

``` sql
SELECT endsWithUTF8('中国', '国');
```

## startsWith

Returns whether string `str` starts with `prefix`.
Expand All @@ -745,6 +761,18 @@ startsWith(str, prefix)
SELECT startsWith('Spider-Man', 'Spi');
```

## startsWithUTF8

Returns whether string `str` starts with `prefix`. The difference between `startsWithUTF8` and `startsWith` is that `startsWithUTF8` matches `str` and `prefix` by UTF-8 characters.

**Syntax**

```sql
startsWithUTF8(str, prefix)
```

**Example**

``` sql
SELECT startsWithUTF8('中国', '中');
```


## trim

Removes the specified characters from the start or end of a string. If not specified otherwise, the function removes whitespace (ASCII-character 32).
Expand Down
78 changes: 68 additions & 10 deletions src/Functions/FunctionStartsEndsWith.h
Expand Up @@ -28,10 +28,24 @@ namespace ErrorCodes
/// Name tag for the `startsWith` SQL function.
/// `name` is the function name; `is_utf8 == false` marks the non-UTF-8 variant.
struct NameStartsWith
{
    static constexpr const char * name = "startsWith";
    static constexpr bool is_utf8 = false;
};
/// Name tag for the `endsWith` SQL function.
/// `name` is the function name; `is_utf8 == false` marks the non-UTF-8 variant.
struct NameEndsWith
{
    static constexpr const char * name = "endsWith";
    static constexpr bool is_utf8 = false;
};

/// Name tag for the `startsWithUTF8` SQL function.
/// `name` is the function name; `is_utf8 == true` marks the UTF-8 aware variant.
struct NameStartsWithUTF8
{
    static constexpr const char * name = "startsWithUTF8";
    static constexpr bool is_utf8 = true;
};

/// Name tag for the `endsWithUTF8` SQL function.
/// `name` is the function name; `is_utf8 == true` marks the UTF-8 aware variant.
struct NameEndsWithUTF8
{
    static constexpr const char * name = "endsWithUTF8";
    static constexpr bool is_utf8 = true;
};

DECLARE_MULTITARGET_CODE(
Expand All @@ -41,6 +55,7 @@ class FunctionStartsEndsWith : public IFunction
{
public:
static constexpr auto name = Name::name;
static constexpr auto is_utf8 = Name::is_utf8;

String getName() const override
{
Expand All @@ -64,7 +79,8 @@ class FunctionStartsEndsWith : public IFunction

DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (isStringOrFixedString(arguments[0]) && isStringOrFixedString(arguments[1]))
if (!is_utf8 && isStringOrFixedString(arguments[0]) && isStringOrFixedString(arguments[1])
|| isString(arguments[0]) && isString(arguments[1]))
return std::make_shared<DataTypeUInt8>();

if (isArray(arguments[0]) && isArray(arguments[1]))
Expand All @@ -78,8 +94,11 @@ class FunctionStartsEndsWith : public IFunction
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
{
auto data_type = arguments[0].type;
if (isStringOrFixedString(*data_type))

if (!is_utf8 && isStringOrFixedString(*data_type))
return executeImplString(arguments, {}, input_rows_count);
if (is_utf8 && isString(*data_type))
return executeImplStringUTF8(arguments, {}, input_rows_count);
if (isArray(data_type))
return executeImplArray(arguments, {}, input_rows_count);
return {};
Expand Down Expand Up @@ -131,7 +150,6 @@ class FunctionStartsEndsWith : public IFunction
typename ColumnVector<UInt8>::Container & vec_res = col_res->getData();

vec_res.resize(input_rows_count);

if (const ColumnString * haystack = checkAndGetColumn<ColumnString>(haystack_column))
dispatch<StringSource>(StringSource(*haystack), needle_column, vec_res);
else if (const ColumnFixedString * haystack_fixed = checkAndGetColumn<ColumnFixedString>(haystack_column))
Expand All @@ -146,6 +164,26 @@ class FunctionStartsEndsWith : public IFunction
return col_res;
}

/// String-argument path for the UTF-8 variants of the function.
///
/// Allocates a UInt8 result column with `input_rows_count` rows, then picks the
/// haystack source from the runtime type of the first argument's column:
/// a plain ColumnString, or a ColumnConst wrapping one. The second argument's
/// column is forwarded to dispatchUTF8 unchanged.
///
/// Throws ILLEGAL_COLUMN when the haystack column is neither of those.
ColumnPtr executeImplStringUTF8(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const
{
    auto result_column = ColumnVector<UInt8>::create();
    typename ColumnVector<UInt8>::Container & result_data = result_column->getData();
    result_data.resize(input_rows_count);

    const IColumn * const haystack_ptr = arguments[0].column.get();
    const IColumn * const needle_ptr = arguments[1].column.get();

    if (const ColumnString * full_haystack = checkAndGetColumn<ColumnString>(haystack_ptr))
    {
        dispatchUTF8<UTF8StringSource>(UTF8StringSource(*full_haystack), needle_ptr, result_data);
        return result_column;
    }

    if (const ColumnConst * constant_haystack = checkAndGetColumnConst<ColumnString>(haystack_ptr))
    {
        dispatchUTF8<ConstSource<UTF8StringSource>>(ConstSource<UTF8StringSource>(*constant_haystack), needle_ptr, result_data);
        return result_column;
    }

    throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal combination of columns as arguments of function {}", getName());
}


template <typename HaystackSource>
void dispatch(HaystackSource haystack_source, const IColumn * needle_column, PaddedPODArray<UInt8> & res_data) const
{
Expand All @@ -161,6 +199,17 @@ class FunctionStartsEndsWith : public IFunction
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal combination of columns as arguments of function {}", getName());
}

template <typename HaystackSource>
void dispatchUTF8(HaystackSource haystack_source, const IColumn * needle_column, PaddedPODArray<UInt8> & res_data) const
{
if (const ColumnString * needle = checkAndGetColumn<ColumnString>(needle_column))
execute<HaystackSource, UTF8StringSource>(haystack_source, UTF8StringSource(*needle), res_data);
else if (const ColumnConst * needle_const = checkAndGetColumnConst<ColumnString>(needle_column))
execute<HaystackSource, ConstSource<UTF8StringSource>>(haystack_source, ConstSource<UTF8StringSource>(*needle_const), res_data);
else
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal combination of columns as arguments of function {}", getName());
}

template <typename HaystackSource, typename NeedleSource>
static void execute(HaystackSource haystack_source, NeedleSource needle_source, PaddedPODArray<UInt8> & res_data)
{
Expand All @@ -172,18 +221,27 @@ class FunctionStartsEndsWith : public IFunction
auto needle = needle_source.getWhole();

if (needle.size > haystack.size)
{
res_data[row_num] = false;
}
else
{
if constexpr (std::is_same_v<Name, NameStartsWith>)
{
if constexpr (std::is_same_v<Name, NameStartsWith>) /// startsWith
res_data[row_num] = StringRef(haystack.data, needle.size) == StringRef(needle.data, needle.size);
}
else /// endsWith
{
else if constexpr (std::is_same_v<Name, NameEndsWith>) /// endsWith
res_data[row_num] = StringRef(haystack.data + haystack.size - needle.size, needle.size) == StringRef(needle.data, needle.size);
else /// startsWithUTF8 or endsWithUTF8
{
auto length = UTF8::countCodePoints(needle.data, needle.size);

if constexpr (std::is_same_v<Name, NameStartsWithUTF8>)
{
auto slice = haystack_source.getSliceFromLeft(0, length);
res_data[row_num] = StringRef(slice.data, slice.size) == StringRef(needle.data, needle.size);
}
else
{
auto slice = haystack_source.getSliceFromRight(length);
res_data[row_num] = StringRef(slice.data, slice.size) == StringRef(needle.data, needle.size);
}
}
}

Expand Down
21 changes: 21 additions & 0 deletions src/Functions/endsWithUTF8.cpp
@@ -0,0 +1,21 @@
#include <DataTypes/DataTypeString.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionStartsEndsWith.h>


namespace DB
{

/// `endsWithUTF8` is FunctionStartsEndsWith instantiated with the NameEndsWithUTF8 tag.
using FunctionEndsWithUTF8 = FunctionStartsEndsWith<NameEndsWithUTF8>;

/// Registers `endsWithUTF8` in the function factory together with its documentation.
REGISTER_FUNCTION(EndsWithUTF8)
{
    /// The example uses an actual suffix of the haystack, so that it demonstrates a positive match.
    factory.registerFunction<FunctionEndsWithUTF8>(FunctionDocumentation{
        .description = R"(
Returns whether string `str` ends with `suffix`, the difference between `endsWithUTF8` and `endsWith` is that `endsWithUTF8` matches `str` and `suffix` by UTF-8 characters.
)",
        .examples{{"endsWithUTF8", "select endsWithUTF8('富强民主文明和谐', '和谐');", ""}},
        .categories{"String"}});
}

}
21 changes: 21 additions & 0 deletions src/Functions/startsWithUTF8.cpp
@@ -0,0 +1,21 @@
#include <DataTypes/DataTypeString.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionStartsEndsWith.h>


namespace DB
{

/// `startsWithUTF8` is FunctionStartsEndsWith instantiated with the NameStartsWithUTF8 tag.
using FunctionStartsWithUTF8 = FunctionStartsEndsWith<NameStartsWithUTF8>;

/// Registers `startsWithUTF8` in the function factory together with its documentation.
REGISTER_FUNCTION(StartsWithUTF8)
{
    /// The description names the second argument `prefix`, matching the function's purpose.
    factory.registerFunction<FunctionStartsWithUTF8>(FunctionDocumentation{
        .description = R"(
Returns whether string `str` starts with `prefix`, the difference between `startsWithUTF8` and `startsWith` is that `startsWithUTF8` matches `str` and `prefix` by UTF-8 characters.
)",
        .examples{{"startsWithUTF8", "select startsWithUTF8('富强民主文明和谐', '富强');", ""}},
        .categories{"String"}});
}

}
29 changes: 29 additions & 0 deletions tests/queries/0_stateless/02833_starts_ends_with_utf8.reference
@@ -0,0 +1,29 @@
-- { echoOn }
select startsWithUTF8('富强民主文明和谐', '富强');
1
select startsWithUTF8('富强民主文明和谐', '\xe5');
0
select startsWithUTF8('富强民主文明和谐', '');
1
SELECT startsWithUTF8('123', '123');
1
SELECT startsWithUTF8('123', '12');
1
SELECT startsWithUTF8('123', '1234');
0
SELECT startsWithUTF8('123', '');
1
select endsWithUTF8('富强民主文明和谐', '和谐');
1
select endsWithUTF8('富强民主文明和谐', '\x90');
0
select endsWithUTF8('富强民主文明和谐', '');
1
SELECT endsWithUTF8('123', '3');
1
SELECT endsWithUTF8('123', '23');
1
SELECT endsWithUTF8('123', '32');
0
SELECT endsWithUTF8('123', '');
1
19 changes: 19 additions & 0 deletions tests/queries/0_stateless/02833_starts_ends_with_utf8.sql
@@ -0,0 +1,19 @@
-- { echoOn }
select startsWithUTF8('富强民主文明和谐', '富强');
select startsWithUTF8('富强民主文明和谐', '\xe5');
select startsWithUTF8('富强民主文明和谐', '');

SELECT startsWithUTF8('123', '123');
SELECT startsWithUTF8('123', '12');
SELECT startsWithUTF8('123', '1234');
SELECT startsWithUTF8('123', '');

select endsWithUTF8('富强民主文明和谐', '和谐');
select endsWithUTF8('富强民主文明和谐', '\x90');
select endsWithUTF8('富强民主文明和谐', '');

SELECT endsWithUTF8('123', '3');
SELECT endsWithUTF8('123', '23');
SELECT endsWithUTF8('123', '32');
SELECT endsWithUTF8('123', '');
-- { echoOff }
2 changes: 2 additions & 0 deletions utils/check-style/aspell-ignore/en/aspell-dict.txt
Expand Up @@ -1397,6 +1397,7 @@ encodings
encryptions
endian
endsWith
endsWithUTF
enum
enum's
enums
Expand Down Expand Up @@ -2203,6 +2204,7 @@ src
stacktrace
stacktraces
startsWith
startsWithUTF
statbox
stateful
stddev
Expand Down