-
Notifications
You must be signed in to change notification settings - Fork 1.2k
/
Copy pathRegexParser.h
284 lines (236 loc) · 11.7 KB
/
RegexParser.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
//-------------------------------------------------------------------------------------------------------
// Copyright (C) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
//-------------------------------------------------------------------------------------------------------
#pragma once
namespace UnifiedRegex
{
struct ParseError
{
bool isBody;
CharCount pos; // Position in unicode characters
CharCount encodedPos; // Position in underlying characters (eg utf-8 bytes)
HRESULT error;
ParseError(bool isBody, CharCount pos, CharCount encodedPos, HRESULT error);
};
template <typename EncodingPolicy, const bool IsLiteral>
class Parser : private EncodingPolicy, private Chars<char16>
{
private:
typedef typename EncodingPolicy::EncodedChar EncodedChar;
// A linked list node to track indices of surrogate pairs.
struct SurrogatePairTracker
{
const EncodedChar* location;
// If this surrogate pair is inside a range, then rangeLocation isn't null.
const EncodedChar* rangeLocation;
codepoint_t value;
uint32 length;
size_t multiUnits;
SurrogatePairTracker* next;
SurrogatePairTracker(const EncodedChar* location, codepoint_t value, uint32 length, size_t multiUnits)
: location(location)
, next(nullptr)
, value(value)
, length(length)
, multiUnits(multiUnits)
, rangeLocation(nullptr)
{
}
SurrogatePairTracker(const EncodedChar* location, const EncodedChar* rangeLocation, codepoint_t value, uint32 length, size_t multiUnits)
: location(location)
, next(nullptr)
, value(value)
, length(length)
, multiUnits(multiUnits)
, rangeLocation(rangeLocation)
{
}
bool IsInsideRange() const
{
return this->rangeLocation != nullptr;
}
};
static const CharCount initLitbufSize = 16;
Js::ScriptContext* scriptContext;
// Arena for nodes and items needed only during compilation
ArenaAllocator* ctAllocator;
// Standard characters using raw encoding character representation (eg char for utf-8)
StandardChars<EncodedChar>* standardEncodedChars;
// Standard characters using final character representation (eg char16 for Unicode)
StandardChars<Char>* standardChars;
#if ENABLE_REGEX_CONFIG_OPTIONS
DebugWriter* w;
#endif
const EncodedChar* input;
const EncodedChar* inputLim;
const EncodedChar* next;
bool inBody;
// Maximum number of capturing groups allowed, including the entire regexp, which is always
// considered a capturing group. Using INT16_MAX allows us to pass one value for each
// group, plus a few additional values, to a JavaScript function without overflowing the
// number of arguments. This is important, for example, in the implementation of
// String.prototype.replace, where the second argument is a function.
static const uint16 MAX_NUM_GROUPS = INT16_MAX;
uint16 numGroups; // determined in first parse
// Keeps track of how many capturing groups we've seen during parsing. We use an int, rather than
// a uint16, to be sure we don't overflow during parsing and only check it against MAX_NUM_GROUPS at
// the end. (We know we can't overflow an int because strings and regex literals are limited to
// 2G characters and therefore to 1G pairs of parentheses, which can fit into an int. I'd prefer
// to use size_t here, but making that change would go down a serious rabbit hole changing the
// interface to UnifiedRegex::Node::AccumDefineGroups.)
int nextGroupId;
// Buffer accumulating all literals.
// In compile-time allocator, must be transferred to runtime allocator when build program
Char* litbuf;
CharCount litbufLen;
CharCount litbufNext;
// During pass 0, if /u option for regex is provided, a linked list will be built up to
// track positions of surrogate pairs in the buffer. During pass 1, these linked lists will be used
// to figure out when to output a surrogate pair node.
SurrogatePairTracker* surrogatePairList;
SurrogatePairTracker* currentSurrogatePairNode;
bool unicodeFlagPresent;
bool caseInsensitiveFlagPresent;
bool dotAllFlagPresent;
// The following two variables are used to determine if the the surrogate pair has been encountered
// First holds the temporary location, second holds the value of the codepoint
const EncodedChar* tempLocationOfSurrogatePair;
// This will be set to a location when we are parsing a range in TermPass0, and cleared when we are out of it.
const EncodedChar* tempLocationOfRange;
codepoint_t codePointAtTempLocation;
// When a surrogate is added for tracking, this will be updated.
const EncodedChar* positionAfterLastSurrogate;
codepoint_t valueOfLastSurrogate;
// deferred error state.
ParseError* deferredIfNotUnicodeError;
ParseError* deferredIfUnicodeError;
private:
//
// Input buffer management
//
void SetPosition(const EncodedChar* input, const EncodedChar* inputLim, bool inBody);
// Current position in number of logical characters, regardless of underlying character encoding
inline CharCount Pos();
inline bool IsEOF();
inline bool ECCanConsume(CharCount n = 1);
inline EncodedChar ECLookahead(CharCount n = 0);
inline EncodedChar ECLookback(CharCount n = 0);
inline void ECConsume(CharCount n = 1);
inline void ECConsumeMultiUnit(CharCount n = 1);
inline void ECRevert(CharCount n = 1);
//
// Helpers
//
int TryParseExtendedUnicodeEscape(Char& c, bool& previousSurrogatePart, bool trackSurrogatePair = false);
void TrackIfSurrogatePair(codepoint_t codePoint, const EncodedChar* location, uint32 consumptionLength);
Node* CreateSurrogatePairAtom(char16 lower, char16 upper);
AltNode* AppendSurrogateRangeToDisjunction(codepoint_t lowerCodePoint, codepoint_t upperCodePoint, AltNode *lastAlttNode);
AltNode* AppendSurrogatePairToDisjunction(codepoint_t codePoint, AltNode *lastAlttNode);
//
// Errors
//
void Fail(HRESULT error);
void DeferredFailIfUnicode(HRESULT error);
void DeferredFailIfNotUnicode(HRESULT error);
inline void ECMust(EncodedChar ec, HRESULT error);
inline Char NextChar();
//
// Patterns/Disjunctions/Alternatives
//
void PatternPass0();
Node* PatternPass1();
Node* UnionNodes(Node* prev, Node* curr);
void DisjunctionPass0(int depth);
Node* DisjunctionPass1();
bool IsEndOfAlternative();
void EnsureLitbuf(CharCount size);
void AccumLiteral(MatchLiteralNode* deferredLiteralNode, Node* charOrLiteralNode);
Node* FinalTerm(Node* node, MatchLiteralNode* deferredLiteralNode);
void AlternativePass0(int depth);
Node* AlternativePass1();
//
// Terms
//
Node* NewLoopNode(CharCount lower, CharCountOrFlag upper, bool isGreedy, Node* body);
bool AtQuantifier();
bool OptNonGreedy();
CharCount RepeatCount();
void TermPass0(int depth);
Node* TermPass1(MatchCharNode* deferredCharNode, bool& previousSurrogatePart);
bool AtomEscapePass0();
bool AtomEscapePass1(Node*& node, MatchCharNode* deferredCharNode, bool& previousSurrogatePart);
bool SurrogatePairPass1(Node*& node, MatchCharNode* deferredCharNode, bool& previousSurrogatePart);
//
// Classes
//
bool AtSecondSingletonClassAtom();
void CharacterClassPass0();
template <bool containsSurrogates>
Node* CharacterClassPass1();
bool ClassEscapePass0(Char& singleton, bool& previousSurrogatePart);
Node* ClassEscapePass1(MatchCharNode* deferredCharNode, MatchSetNode* deferredSetNode, bool& previousSurrogatePart);
Node* GetNodeWithValidCharacterSet(EncodedChar ch);
//
// Options
//
void Options(RegexFlags& flags);
public:
Parser
( Js::ScriptContext* scriptContext
, ArenaAllocator* ctAllocator
, StandardChars<EncodedChar>* standardEncodedChars
, StandardChars<Char>* standardChars
, bool isUtf8
#if ENABLE_REGEX_CONFIG_OPTIONS
, DebugWriter* w
#endif
);
//
// Entry points
//
Node* ParseDynamic
( const EncodedChar* body // non null, null terminated (may contain embedded nulls)
, const EncodedChar* bodyLim // points to terminating null of above
, const EncodedChar* opts // may be null if no options, otherwise null terminated
, const EncodedChar* optsLim // if above non-null, points to terminating null of above
, RegexFlags& flags );
// (*) For ParseLiteral:
// - input string must be null terminated
// - inputLim may point to the terminating null in above or before it
// - if the later, input is known to be syntactically well-formed so that the parser
// will find the natural end of the regex literal before passing inputLim
// - input may contain nulls before the inputLim
Node* ParseLiteral
( const EncodedChar* input // non null, null terminated (may contain embedded nulls)
, const EncodedChar* inputLim // see (*) above
, CharCount& outBodyEncodedChars // in encoded characters, not including trailing '/'
, CharCount& outTotalEncodedChars // in encoded characters, including trailing '/' and any options
, CharCount& outBodyChars // in unicode characters, not including ttrailing '/'
, CharCount& outTotalChars // in unicode characters, including trailing '/' and any options
, RegexFlags& flags );
void ParseLiteralNoAST
( const EncodedChar* input // non null, null terminated
, const EncodedChar* inputLim // see (*) above
, CharCount& outBodyEncodedChars
, CharCount& outTotalEncodedChars
, CharCount& outBodyChars
, CharCount& outTotalChars );
template<const bool buildAST>
RegexPattern* CompileProgram
( Node* root,
const EncodedChar*& currentCharacter,
const CharCount totalLen,
const CharCount bodyChars,
const CharCount bodyEncodedChars,
const CharCount totalChars,
const RegexFlags flags );
static void CaptureEmptySourceAndNoGroups(Program* program);
// bodyChars is number of unicode characters in program body, which may be less than the number
// of underlying UTF-8 characters
void CaptureSourceAndGroups(Recycler* recycler, Program* program, const EncodedChar* body, CharCount bodyChars, CharCount bodyEncodedChars);
inline const Char* GetLitbuf() { return litbuf; }
void FreeBody();
size_t GetMultiUnits() { return this->m_cMultiUnits; }
};
}