Skip to content

Commit

Permalink
Merge pull request #52555 from bigo-sg/starts_ends_with_utf8
Browse files Browse the repository at this point in the history
Add new function startsWithUTF8 and endsWithUTF8
  • Loading branch information
Avogar committed Aug 10, 2023
2 parents 57025ee + d15ae5e commit 14aad35
Show file tree
Hide file tree
Showing 7 changed files with 203 additions and 10 deletions.
43 changes: 43 additions & 0 deletions docs/en/sql-reference/functions/string-functions.md
Expand Up @@ -729,6 +729,30 @@ Returns whether string `str` ends with `suffix`.
endsWith(str, suffix)
```

## endsWithUTF8

Returns whether string `str` ends with `suffix`. Unlike `endsWith`, `endsWithUTF8` matches `str` and `suffix` by UTF-8 characters rather than by bytes.

**Syntax**

```sql
endsWithUTF8(str, suffix)
```

**Example**

``` sql
SELECT endsWithUTF8('中国', '\xbd'), endsWith('中国', '\xbd')
```

Result:

```result
┌─endsWithUTF8('中国', '½')─┬─endsWith('中国', '½')─┐
│ 0 │ 1 │
└──────────────────────────┴──────────────────────┘
```

## startsWith

Returns whether string `str` starts with `prefix`.
Expand All @@ -745,6 +769,25 @@ startsWith(str, prefix)
SELECT startsWith('Spider-Man', 'Spi');
```

## startsWithUTF8

Returns whether string `str` starts with `prefix`. Unlike `startsWith`, `startsWithUTF8` matches `str` and `prefix` by UTF-8 characters rather than by bytes.


**Example**

``` sql
SELECT startsWithUTF8('中国', '\xe4'), startsWith('中国', '\xe4')
```

Result:

```result
┌─startsWithUTF8('中国', 'ä')─┬─startsWith('中国', 'ä')─┐
│ 0 │ 1 │
└────────────────────────────┴────────────────────────┘
```

## trim

Removes the specified characters from the start or end of a string. If not specified otherwise, the function removes whitespace (ASCII-character 32).
Expand Down
78 changes: 68 additions & 10 deletions src/Functions/FunctionStartsEndsWith.h
Expand Up @@ -28,10 +28,24 @@ namespace ErrorCodes
/// Name tag for the `startsWith` function: carries the SQL-visible function name
/// and marks the function as not UTF-8 aware.
struct NameStartsWith
{
    static constexpr const char * name = "startsWith";
    static constexpr bool is_utf8 = false;
};
/// Name tag for the `endsWith` function: carries the SQL-visible function name
/// and marks the function as not UTF-8 aware.
struct NameEndsWith
{
    static constexpr const char * name = "endsWith";
    static constexpr bool is_utf8 = false;
};

/// Name tag for the `startsWithUTF8` function: carries the SQL-visible function name
/// and marks the function as UTF-8 aware.
struct NameStartsWithUTF8
{
    static constexpr const char * name = "startsWithUTF8";
    static constexpr bool is_utf8 = true;
};

/// Name tag for the `endsWithUTF8` function: carries the SQL-visible function name
/// and marks the function as UTF-8 aware.
struct NameEndsWithUTF8
{
    static constexpr const char * name = "endsWithUTF8";
    static constexpr bool is_utf8 = true;
};

DECLARE_MULTITARGET_CODE(
Expand All @@ -41,6 +55,7 @@ class FunctionStartsEndsWith : public IFunction
{
public:
static constexpr auto name = Name::name;
static constexpr auto is_utf8 = Name::is_utf8;

String getName() const override
{
Expand All @@ -64,7 +79,8 @@ class FunctionStartsEndsWith : public IFunction

DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (isStringOrFixedString(arguments[0]) && isStringOrFixedString(arguments[1]))
if (!is_utf8 && isStringOrFixedString(arguments[0]) && isStringOrFixedString(arguments[1])
|| isString(arguments[0]) && isString(arguments[1]))
return std::make_shared<DataTypeUInt8>();

if (isArray(arguments[0]) && isArray(arguments[1]))
Expand All @@ -78,8 +94,11 @@ class FunctionStartsEndsWith : public IFunction
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
{
auto data_type = arguments[0].type;
if (isStringOrFixedString(*data_type))

if (!is_utf8 && isStringOrFixedString(*data_type))
return executeImplString(arguments, {}, input_rows_count);
if (is_utf8 && isString(*data_type))
return executeImplStringUTF8(arguments, {}, input_rows_count);
if (isArray(data_type))
return executeImplArray(arguments, {}, input_rows_count);
return {};
Expand Down Expand Up @@ -131,7 +150,6 @@ class FunctionStartsEndsWith : public IFunction
typename ColumnVector<UInt8>::Container & vec_res = col_res->getData();

vec_res.resize(input_rows_count);

if (const ColumnString * haystack = checkAndGetColumn<ColumnString>(haystack_column))
dispatch<StringSource>(StringSource(*haystack), needle_column, vec_res);
else if (const ColumnFixedString * haystack_fixed = checkAndGetColumn<ColumnFixedString>(haystack_column))
Expand All @@ -146,6 +164,26 @@ class FunctionStartsEndsWith : public IFunction
return col_res;
}

/// Runs the UTF-8 variant of the function on String arguments.
/// arguments[0] supplies the haystack column and arguments[1] the needle column.
/// Returns a UInt8 column whose data is resized to input_rows_count before dispatching.
/// The haystack must be either a full ColumnString or a constant String column;
/// any other haystack column raises ILLEGAL_COLUMN.
ColumnPtr executeImplStringUTF8(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const
{
    const IColumn * haystack_col = arguments[0].column.get();
    const IColumn * needle_col = arguments[1].column.get();

    auto result_column = ColumnVector<UInt8>::create();
    auto & result_data = result_column->getData();
    result_data.resize(input_rows_count);

    /// Full (non-constant) haystack column.
    if (const auto * full_haystack = checkAndGetColumn<ColumnString>(haystack_col))
    {
        dispatchUTF8<UTF8StringSource>(UTF8StringSource(*full_haystack), needle_col, result_data);
        return result_column;
    }

    /// Constant haystack column.
    if (const auto * const_haystack = checkAndGetColumnConst<ColumnString>(haystack_col))
    {
        dispatchUTF8<ConstSource<UTF8StringSource>>(ConstSource<UTF8StringSource>(*const_haystack), needle_col, result_data);
        return result_column;
    }

    throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal combination of columns as arguments of function {}", getName());
}


template <typename HaystackSource>
void dispatch(HaystackSource haystack_source, const IColumn * needle_column, PaddedPODArray<UInt8> & res_data) const
{
Expand All @@ -161,6 +199,17 @@ class FunctionStartsEndsWith : public IFunction
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal combination of columns as arguments of function {}", getName());
}

template <typename HaystackSource>
void dispatchUTF8(HaystackSource haystack_source, const IColumn * needle_column, PaddedPODArray<UInt8> & res_data) const
{
if (const ColumnString * needle = checkAndGetColumn<ColumnString>(needle_column))
execute<HaystackSource, UTF8StringSource>(haystack_source, UTF8StringSource(*needle), res_data);
else if (const ColumnConst * needle_const = checkAndGetColumnConst<ColumnString>(needle_column))
execute<HaystackSource, ConstSource<UTF8StringSource>>(haystack_source, ConstSource<UTF8StringSource>(*needle_const), res_data);
else
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal combination of columns as arguments of function {}", getName());
}

template <typename HaystackSource, typename NeedleSource>
static void execute(HaystackSource haystack_source, NeedleSource needle_source, PaddedPODArray<UInt8> & res_data)
{
Expand All @@ -172,18 +221,27 @@ class FunctionStartsEndsWith : public IFunction
auto needle = needle_source.getWhole();

if (needle.size > haystack.size)
{
res_data[row_num] = false;
}
else
{
if constexpr (std::is_same_v<Name, NameStartsWith>)
{
if constexpr (std::is_same_v<Name, NameStartsWith>) /// startsWith
res_data[row_num] = StringRef(haystack.data, needle.size) == StringRef(needle.data, needle.size);
}
else /// endsWith
{
else if constexpr (std::is_same_v<Name, NameEndsWith>) /// endsWith
res_data[row_num] = StringRef(haystack.data + haystack.size - needle.size, needle.size) == StringRef(needle.data, needle.size);
else /// startsWithUTF8 or endsWithUTF8
{
auto length = UTF8::countCodePoints(needle.data, needle.size);

if constexpr (std::is_same_v<Name, NameStartsWithUTF8>)
{
auto slice = haystack_source.getSliceFromLeft(0, length);
res_data[row_num] = StringRef(slice.data, slice.size) == StringRef(needle.data, needle.size);
}
else
{
auto slice = haystack_source.getSliceFromRight(length);
res_data[row_num] = StringRef(slice.data, slice.size) == StringRef(needle.data, needle.size);
}
}
}

Expand Down
21 changes: 21 additions & 0 deletions src/Functions/endsWithUTF8.cpp
@@ -0,0 +1,21 @@
#include <DataTypes/DataTypeString.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionStartsEndsWith.h>


namespace DB
{

/// `endsWithUTF8` is the FunctionStartsEndsWith template instantiated with the NameEndsWithUTF8 tag.
using FunctionEndsWithUTF8 = FunctionStartsEndsWith<NameEndsWithUTF8>;

/// Registers `endsWithUTF8` in the function factory together with its embedded documentation.
REGISTER_FUNCTION(EndsWithUTF8)
{
    /// The example passes an actual suffix of the haystack ('和谐'); the previous example
    /// reused the startsWith prefix ('富强') and therefore demonstrated a non-matching call.
    factory.registerFunction<FunctionEndsWithUTF8>(FunctionDocumentation{
        .description = R"(
Returns whether string `str` ends with `suffix`, the difference between `endsWithUTF8` and `endsWith` is that `endsWithUTF8` match `str` and `suffix` by UTF-8 characters.
)",
        .examples{{"endsWithUTF8", "select endsWithUTF8('富强民主文明和谐', '和谐');", ""}},
        .categories{"String"}});
}

}
21 changes: 21 additions & 0 deletions src/Functions/startsWithUTF8.cpp
@@ -0,0 +1,21 @@
#include <DataTypes/DataTypeString.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionStartsEndsWith.h>


namespace DB
{

/// `startsWithUTF8` is the FunctionStartsEndsWith template instantiated with the NameStartsWithUTF8 tag.
using FunctionStartsWithUTF8 = FunctionStartsEndsWith<NameStartsWithUTF8>;

/// Registers `startsWithUTF8` in the function factory together with its embedded documentation.
REGISTER_FUNCTION(StartsWithUTF8)
{
    /// The description names the second argument `prefix`; it previously said `suffix`,
    /// which is the argument of endsWithUTF8.
    factory.registerFunction<FunctionStartsWithUTF8>(FunctionDocumentation{
        .description = R"(
Returns whether string `str` starts with `prefix`, the difference between `startsWithUTF8` and `startsWith` is that `startsWithUTF8` match `str` and `prefix` by UTF-8 characters.
)",
        .examples{{"startsWithUTF8", "select startsWithUTF8('富强民主文明和谐', '富强');", ""}},
        .categories{"String"}});
}

}
29 changes: 29 additions & 0 deletions tests/queries/0_stateless/02833_starts_ends_with_utf8.reference
@@ -0,0 +1,29 @@
-- { echoOn }
select startsWithUTF8('富强民主文明和谐', '富强');
1
select startsWithUTF8('富强民主文明和谐', '\xe5');
0
select startsWithUTF8('富强民主文明和谐', '');
1
SELECT startsWithUTF8('123', '123');
1
SELECT startsWithUTF8('123', '12');
1
SELECT startsWithUTF8('123', '1234');
0
SELECT startsWithUTF8('123', '');
1
select endsWithUTF8('富强民主文明和谐', '和谐');
1
select endsWithUTF8('富强民主文明和谐', '\x90');
0
select endsWithUTF8('富强民主文明和谐', '');
1
SELECT endsWithUTF8('123', '3');
1
SELECT endsWithUTF8('123', '23');
1
SELECT endsWithUTF8('123', '32');
0
SELECT endsWithUTF8('123', '');
1
19 changes: 19 additions & 0 deletions tests/queries/0_stateless/02833_starts_ends_with_utf8.sql
@@ -0,0 +1,19 @@
-- { echoOn }
select startsWithUTF8('富强民主文明和谐', '富强');
select startsWithUTF8('富强民主文明和谐', '\xe5');
select startsWithUTF8('富强民主文明和谐', '');

SELECT startsWithUTF8('123', '123');
SELECT startsWithUTF8('123', '12');
SELECT startsWithUTF8('123', '1234');
SELECT startsWithUTF8('123', '');

select endsWithUTF8('富强民主文明和谐', '和谐');
select endsWithUTF8('富强民主文明和谐', '\x90');
select endsWithUTF8('富强民主文明和谐', '');

SELECT endsWithUTF8('123', '3');
SELECT endsWithUTF8('123', '23');
SELECT endsWithUTF8('123', '32');
SELECT endsWithUTF8('123', '');
-- { echoOff }
2 changes: 2 additions & 0 deletions utils/check-style/aspell-ignore/en/aspell-dict.txt
Expand Up @@ -1404,6 +1404,7 @@ encodings
encryptions
endian
endsWith
endsWithUTF
enum
enum's
enums
Expand Down Expand Up @@ -2210,6 +2211,7 @@ src
stacktrace
stacktraces
startsWith
startsWithUTF
statbox
stateful
stddev
Expand Down

0 comments on commit 14aad35

Please sign in to comment.