/
URLHierarchy.cpp
114 lines (84 loc) · 3.03 KB
/
URLHierarchy.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionTokens.h>
namespace DB
{
namespace
{
/// Position inside the raw bytes of a string; string and token boundaries below are passed as pairs of these.
using Pos = const char *;
/** Token generator for the `URLPathHierarchy` function (plugged into FunctionTokens).
  *
  * For a URL of the form `scheme://domain<rest>` every call to get() yields one token. All tokens begin
  * at the first `/`, `?` or `#` that follows the domain, and each next token extends one more segment
  * further, up to and including the following separator (or up to the end of the string).
  * Example: `http://example.com/a/b?c` produces `/a/`, `/a/b?`, `/a/b?c`.
  *
  * Strings that do not look like `scheme://something` produce no tokens at all.
  */
class URLPathHierarchyImpl
{
private:
    Pos begin = nullptr; /// Start of the string currently being processed.
    Pos pos = nullptr;   /// Current parsing position; equals `begin` until the first get() call.
    Pos end = nullptr;   /// One past the last byte of the current string.
    Pos start = nullptr; /// Common beginning of all emitted tokens (first separator after the domain).

    /// Characters accepted inside the scheme: lowercase ASCII letters and digits, bounds included.
    static bool isSchemeSymbol(char c)
    {
        return (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9');
    }

    /// Characters that delimit hierarchy levels.
    static bool isSeparator(char c)
    {
        return c == '/' || c == '?' || c == '#';
    }

public:
    static constexpr auto name = "URLPathHierarchy";

    /// The function takes a fixed number of arguments.
    static bool isVariadic() { return false; }

    /// Exactly one argument: the URL.
    static size_t getNumberOfArguments() { return 1; }

    /// No argument is required to be constant.
    static ColumnNumbers getArgumentsThatAreAlwaysConstant() { return {}; }

    /// Describes the single mandatory `URL` argument of type String and delegates the check
    /// to validateFunctionArgumentTypes.
    static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments)
    {
        FunctionArgumentDescriptors mandatory_args{
            {"URL", &isString<IDataType>, nullptr, "String"},
        };

        validateFunctionArgumentTypes(func, arguments, mandatory_args);
    }

    /// Index of the argument that holds the strings to tokenize.
    static constexpr auto strings_argument_position = 0uz;

    /// Nothing to prepare: both parameters are ignored.
    void init(const ColumnsWithTypeAndName & /*arguments*/, bool /*max_substrings_includes_remaining_string*/) {}

    /// Called for each next string. Resets the parsing state to the beginning of [pos_, end_).
    void set(Pos pos_, Pos end_)
    {
        begin = pos = pos_;
        start = begin;
        end = end_;
    }

    /// Get the next token, if any, or return false.
    /// On success [token_begin, token_end) is the next, longer prefix of the path part of the URL.
    bool get(Pos & token_begin, Pos & token_end)
    {
        /// Code from URLParser.

        if (pos == end)
            return false;

        if (pos == begin)
        {
            /// Let's parse everything that goes before the path

            /// Assume that the protocol has already been changed to lowercase.
            /// The ranges are inclusive, so that schemes containing 'a', 'z', '0' or '9'
            /// (for example `data` or `s3a`) are not cut short and rejected below.
            while (pos < end && isSchemeSymbol(*pos))
                ++pos;

            /** We will calculate the hierarchy only for URLs in which there is a protocol, and after it there are two slashes.
              * (http, file - fit, mailto, magnet - do not fit), and after two slashes still at least something is there.
              * For the rest, just return an empty array.
              */
            if (pos == begin || pos == end || !(*pos++ == ':' && pos < end && *pos++ == '/' && pos < end && *pos++ == '/' && pos < end))
            {
                /// Park the position at the end so that subsequent calls return false immediately.
                pos = end;
                return false;
            }

            /// The domain for simplicity is everything that after the protocol and the two slashes, until the next slash or `?` or `#`
            while (pos < end && !isSeparator(*pos))
                ++pos;

            /// Every token starts here, at the separator that terminates the domain.
            start = pos;

            if (pos != end)
                ++pos;
        }

        /// We go to the next `/` or `?` or `#`, skipping all those at the beginning.
        while (pos < end && isSeparator(*pos))
            ++pos;
        if (pos == end)
            return false;
        while (pos < end && !isSeparator(*pos))
            ++pos;

        /// Include the terminating separator in the token.
        if (pos != end)
            ++pos;

        token_begin = start;
        token_end = pos;

        return true;
    }
};
/// The function type: the FunctionTokens template instantiated with the URL path tokenizer defined above.
using FunctionURLPathHierarchy = FunctionTokens<URLPathHierarchyImpl>;
}
/// Registration hook: adds FunctionURLPathHierarchy to `factory`
/// (the `factory` object is presumably supplied by the REGISTER_FUNCTION macro — confirm in FunctionFactory.h).
REGISTER_FUNCTION(URLPathHierarchy)
{
factory.registerFunction<FunctionURLPathHierarchy>();
}
}