/
alphaTokens.cpp
106 lines (81 loc) · 2.42 KB
/
alphaTokens.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#include <Functions/FunctionTokens.h>
#include <Functions/FunctionFactory.h>
#include <Common/StringUtils/StringUtils.h>
namespace DB
{
/** Functions that split strings into an array of strings or vice versa.
  *
  * alphaTokens(s[, max_substrings]) - selects from the string the substrings that match `[a-zA-Z]+`.
  */
namespace
{
/// Read-only character pointer used as a position inside the string being tokenized.
using Pos = const char *;
/// Token generator behind `alphaTokens`: walks over one string at a time and
/// yields its runs of ASCII letters, optionally bounded by `max_substrings`.
class SplitByAlphaImpl
{
private:
    /// Current read position and one-past-the-end of the string being tokenized.
    /// Both start as null so that get() on a generator that has not seen set() yet
    /// reports "no tokens" instead of reading indeterminate pointers.
    Pos pos = nullptr;
    Pos end = nullptr;

    /// Limit extracted from the optional second argument; empty means no limit is applied.
    std::optional<size_t> max_splits;

    /// Number of tokens already produced for the current string (reset by set()).
    size_t splits = 0;

    /// Chooses how the limit is enforced in get(): when true the last allowed token
    /// extends to the end of the string, when false generation simply stops.
    bool max_substrings_includes_remaining_string = false;

public:
    static constexpr auto name = "alphaTokens";

    /// The function accepts a variable number of arguments, so the fixed count is reported as 0.
    static bool isVariadic() { return true; }
    static size_t getNumberOfArguments() { return 0; }

    /// The argument at index 1 (the optional `max_substrings`) is always constant.
    static ColumnNumbers getArgumentsThatAreAlwaysConstant() { return {1}; }

    /// Argument validation is delegated to the shared helper for "string + optional max_substrings" signatures.
    static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments)
    {
        checkArgumentsWithOptionalMaxSubstrings(func, arguments);
    }

    /// Index of the argument that holds the strings to be split.
    static constexpr auto strings_argument_position = 0uz;

    /// Remembers the limit mode and extracts the limit from the argument at index 1.
    void init(const ColumnsWithTypeAndName & arguments, bool max_substrings_includes_remaining_string_)
    {
        max_substrings_includes_remaining_string = max_substrings_includes_remaining_string_;
        max_splits = extractMaxSplits(arguments, 1);
    }

    /// Called for each next string: [pos_, end_) is the range to tokenize.
    void set(Pos pos_, Pos end_)
    {
        pos = pos_;
        end = end_;
        splits = 0;
    }

    /// Get the next token, if any, or return false.
    /// On success [token_begin, token_end) delimits the token inside the current string.
    bool get(Pos & token_begin, Pos & token_end)
    {
        /// Skip everything that is not an ASCII letter.
        while (pos < end && !isAlphaASCII(*pos))
            ++pos;

        if (pos == end)
            return false;

        token_begin = pos;

        if (max_splits)
        {
            if (max_substrings_includes_remaining_string)
            {
                /// The last allowed token takes the whole rest of the string, non-letters included.
                /// NOTE(review): `*max_splits - 1` wraps around if the limit is 0 - presumably
                /// extractMaxSplits never returns 0; confirm against its implementation.
                if (splits == *max_splits - 1)
                {
                    token_end = end;
                    pos = end;
                    return true;
                }
            }
            else if (splits == *max_splits)
                return false; /// Limit reached: the remainder of the string is dropped.
        }

        /// Consume the run of ASCII letters that forms the token.
        while (pos < end && isAlphaASCII(*pos))
            ++pos;

        token_end = pos;
        ++splits;

        return true;
    }
};
/// The SQL function type: the generic FunctionTokens template instantiated with the SplitByAlphaImpl generator.
using FunctionSplitByAlpha = FunctionTokens<SplitByAlphaImpl>;
}
/// Registers FunctionSplitByAlpha with the function factory and adds
/// "splitByAlpha" as an alias for FunctionSplitByAlpha::name.
REGISTER_FUNCTION(SplitByAlpha)
{
factory.registerFunction<FunctionSplitByAlpha>();
factory.registerAlias("splitByAlpha", FunctionSplitByAlpha::name);
}
}