-
Notifications
You must be signed in to change notification settings - Fork 1.2k
/
Copy pathOctoquadIdentifier.h
161 lines (129 loc) · 5.78 KB
/
OctoquadIdentifier.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
//-------------------------------------------------------------------------------------------------------
// Copyright (C) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
//-------------------------------------------------------------------------------------------------------
//
// Matchers for pattern of form:
// pattern ::= atom{8} '|' atom{8}
// atom ::= A | [...charset drawn from A's...]
// where:
// - A is a set of exactly four ASCII characters
// - The pattern ignores case
// - The pattern includes the global flag
// An example pattern would be "ABCdABCd|aDcAbBcD".
#pragma once
namespace UnifiedRegex
{
// ----------------------------------------------------------------------
// Trigrams
// ----------------------------------------------------------------------
// Trigram bookkeeping record: two flags, two int pattern fields, a result counter, and two
// same-sized fixed-capacity arrays (result offsets and cached result strings).
struct TrigramInfo {
// Element count the constructor's pat1/pat2 buffers are annotated with. Equal to the 8 atoms
// per alternative in the pattern grammar described at the top of this file.
static const int PatternLength=8;
// Capacity of the offsets and cachedResult arrays below.
static const int MaxResults=32;
Field(bool) isTrigramPattern;
Field(bool) hasCachedResultString;
// NOTE(review): presumably integer encodings of pat1 and pat2 - confirm against the
// constructor definition.
Field(int) triPat1;
Field(int) triPat2;
// NOTE(review): presumably the number of valid entries in offsets, expected to stay
// <= MaxResults - confirm against the code that fills it.
Field(int) resultCount;
Field(int) offsets[MaxResults];
// Same capacity as offsets. NOTE(review): presumably indexed in parallel with offsets - confirm.
Field(Js::JavascriptString *) cachedResult[MaxResults];
// pat1 and pat2 are each annotated as a buffer of PatternLength chars. What the constructor
// does with them and with recycler is defined outside this header.
TrigramInfo(__in_ecount(PatternLength) char* pat1,__in_ecount(PatternLength) char* pat2, Recycler* recycler);
};
// Pairs a regex pattern pointer with an int-encoded pattern value. Used as the element type of
// TrigramStart::patterns.
struct PatternTri {
// Non-null-ness and ownership are not expressed here; the raw pointer carries no lifetime
// information.
RegexPattern* pattern;
// NOTE(review): the encoding scheme is not visible in this header - see the implementation.
int encodedPattern;
};
// Fixed-capacity list of PatternTri entries. TrigramAlphabet holds an array of these
// (trigramStarts).
struct TrigramStart {
// Capacity of the patterns array below.
static const int MaxPatPerStart=12;
// NOTE(review): presumably the number of used entries in patterns (<= MaxPatPerStart) - confirm.
int count;
PatternTri patterns[MaxPatPerStart];
};
// Tables and constants for trigram matching over a four-character alphabet, plus the input
// buffer fields and the entry points that build and use those tables. All member functions are
// defined outside this header.
struct TrigramAlphabet {
// Size of the alphabet; the file header describes it as a set of exactly four ASCII characters.
static const int AlphaCount=4;
// Size of tables indexed by character value (alphaBits here; also used by OctoquadIdentifier
// and OctoquadMatcher for their per-character tables).
static const int AsciiTableSize=128;
// NOTE(review): meaning not visible in this header - confirm against the implementation.
static const int BitsNotInAlpha=4;
// Size of trigramMap below. NOTE(review): the derivation of 221 is not visible here.
static const int TrigramMapSize=221;
// NOTE(review): looks like a sentinel just outside the 0..TrigramCount-1 range - confirm.
static const int TrigramNotInPattern=65;
// 0x20 is the bit that differs between upper- and lower-case ASCII letters.
static const char LowerCaseBit=0x20;
// 0x5f has bit 0x20 (and bit 0x80) cleared, so masking an ASCII lower-case letter with it
// yields the upper-case letter.
static const char UpperCaseMask=0x5f;
// Size of trigramStarts below. 64 == AlphaCount * AlphaCount * AlphaCount.
static const int TrigramCount=64;
// NOTE(review): not referenced in this header; meaning defined by the implementation.
static const int MaxCachedStarts=48;
TrigramStart trigramStarts[TrigramCount];
char alpha[AlphaCount];
char alphaBits[AsciiTableSize];
char trigramMap[TrigramMapSize];
// Note that MegaMatch below has parameters with the same names as these two members.
const char16* input;
int inputLen;
void InitTrigramMap();
// pat1 and pat2 are annotated as buffers of TrigramInfo::PatternLength chars.
// NOTE(review): the meaning of the bool result is not visible here - confirm.
bool AddStarts(__in_xcount(TrigramInfo::PatternLength) char* pat1,__in_xcount(TrigramInfo::PatternLength) char* pat2, RegexPattern* pattern);
// input is annotated as a buffer of inputLen char16 elements.
void MegaMatch(__in_ecount(inputLen) const char16* input,int inputLen);
};
// ----------------------------------------------------------------------
// OctoquadIdentifier
// ----------------------------------------------------------------------
// Accumulates up to NumPatterns patterns of TrigramInfo::PatternLength atoms over a
// four-character alphabet, recording a 4-bit mask per atom, and reports via IsOctoquad whether
// the result has the octoquad form described at the top of this file. OctoquadMatcher is a
// friend and is constructed from an instance of this class.
class OctoquadIdentifier : private Chars<char16>
{
friend class OctoquadMatcher;
public:
// Number of alternatives in an octoquad pattern (the two sides of the '|').
static const int NumPatterns = 2;
private:
// Number of characters in the alphabet encountered so far
int numCodes;
// Maps a character code to the character
// This is a reference to an array supplied to the constructor, so that array must outlive
// this object.
char (&codeToChar)[TrigramAlphabet::AlphaCount];
// Maps a character to its code 0-3. This array is passed into the constructor and only indexes for characters in the
// alphabet are updated.
char (&charToCode)[TrigramAlphabet::AsciiTableSize];
// For each octoquad pattern, each byte contains a 4-bit pattern. One character will be represented as 0x1, 0x2, 0x4, or
// 0x8 since it's a quad alphabet. A character class in the pattern can cause the bit pattern to be a combination of the
// character bits.
char patternBits[NumPatterns][TrigramInfo::PatternLength];
// NOTE(review): presumably the number of atoms appended to the current pattern and the index
// of the pattern being built - confirm against the implementation.
int currPatternLength;
int currPatternNum;
// NOTE(review): the SAL annotations below name regex::TrigramAlphabet, while this header
// declares TrigramAlphabet in namespace UnifiedRegex - confirm the qualifier is intended.
void SetTrigramAlphabet(Js::ScriptContext * scriptContext,
__in_xcount(regex::TrigramAlphabet::AlphaCount) char* alpha,
__in_xcount(regex::TrigramAlphabet::AsciiTableSize) char* alphaBits);
public:
// NOTE(review): presumably a pre-check on whether program is a candidate for octoquad
// identification - confirm against the implementation.
static bool Qualifies(const Program *const program);
// Binds codeToChar and charToCode to the caller's arrays; see the member comments above.
OctoquadIdentifier(
const int numCodes,
char (&codeToChar)[TrigramAlphabet::AlphaCount],
char (&charToCode)[TrigramAlphabet::AsciiTableSize]);
// Returns -1 if character not in quad alphabet and the alphabet is full
int GetOrAddCharCode(const Char c);
// Builder-style entry points. NOTE(review): the bool results presumably report whether the
// pattern can still be an octoquad after the step - confirm against the implementation.
bool BeginConcat();
bool CouldAppend(const CharCount n) const;
bool AppendChar(Char c);
bool BeginUnions();
bool UnionChar(Char c);
void EndUnions();
bool IsOctoquad();
void InitializeTrigramInfo(Js::ScriptContext* scriptContext, RegexPattern* const pattern);
};
// ----------------------------------------------------------------------
// OctoquadMatcher
// ----------------------------------------------------------------------
// Matcher built from an OctoquadIdentifier. The constructor is private; the only public way to
// obtain an instance is the static New function, which also takes a Recycler.
class OctoquadMatcher : private Chars<char16>
{
private:
OctoquadMatcher(const StandardChars<Char>* standardChars, CaseInsensitive::MappingSource mappingSource, OctoquadIdentifier* identifier);
// Alphabet characters indexed by code; sized by TrigramAlphabet::AlphaCount.
Field(Char) codeToChar[TrigramAlphabet::AlphaCount];
// Maps characters (0..AsciiTableSize-1) to 0 if not in alphabet, or 0x1, 0x2, 0x4 or 0x8.
// NOTE(review): the original note here read "Allocated and filled only if invoke Match below",
// but this member is declared as an inline array, so the "allocated" wording looks stale -
// confirm when the table is actually filled.
Field(uint8) charToBits[TrigramAlphabet::AsciiTableSize];
// One 32-bit value per pattern. NOTE(review): presumably the eight 4-bit atom masks of each
// pattern packed together (8 * 4 == 32) - confirm against the constructor.
Field(uint32) patterns[OctoquadIdentifier::NumPatterns];
public:
static OctoquadMatcher *New(Recycler* recycler, const StandardChars<Char>* standardChars, CaseInsensitive::MappingSource mappingSource, OctoquadIdentifier* identifier);
// offset is taken by non-const reference. NOTE(review): presumably the search start on entry
// and the match position on success - confirm. The stats parameter exists only when
// ENABLE_REGEX_CONFIG_OPTIONS is set.
bool Match
( const Char* const input
, const CharCount inputLength
, CharCount& offset
#if ENABLE_REGEX_CONFIG_OPTIONS
, RegexStats* stats
#endif
);
#if ENABLE_REGEX_CONFIG_OPTIONS
void Print(DebugWriter* w) const;
#endif
};
}