-
Notifications
You must be signed in to change notification settings - Fork 202
/
xml_lexer.js
165 lines (135 loc) · 3.87 KB
/
xml_lexer.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import { createToken as createTokenOrg, Lexer } from "chevrotain";
// A little mini DSL for easier lexer definition:
// named regex fragments that can be composed into larger patterns.
const fragments = {};
const f = fragments;

/**
 * Registers a reusable regex fragment under `name`.
 * @param {string} name - Key the fragment is stored under in `fragments`.
 * @param {string|RegExp} def - Raw pattern text, or a RegExp whose source is stored.
 */
function FRAGMENT(name, def) {
  const source = typeof def === "string" ? def : def.source;
  fragments[name] = source;
}
/**
 * Tagged-template helper that builds a RegExp from literal chunks and
 * interpolated patterns. Interpolations may be strings, RegExps, or
 * chevrotain TokenTypes (their PATTERN is used).
 * @returns {RegExp} The composed pattern.
 */
function makePattern(strings, ...args) {
  const parts = [];
  strings.forEach((chunk, idx) => {
    parts.push(chunk);
    if (idx < args.length) {
      let interpolated = args[idx];
      // A TokenType may be interpolated directly; unwrap to its PATTERN.
      if (interpolated.PATTERN) {
        interpolated = interpolated.PATTERN;
      }
      const patternSource =
        typeof interpolated === "string" ? interpolated : interpolated.source;
      // Wrapping in a (non-capturing) group makes quantifiers and
      // assertions applied to the interpolated fragment safe.
      parts.push(`(?:${patternSource})`);
    }
  });
  return new RegExp(parts.join(""));
}
// All TokenTypes in definition order, plus a name -> TokenType lookup.
const tokensArray = [];
export const tokensDictionary = {};

/**
 * Creates a chevrotain TokenType and records it in both the ordered
 * array and the exported name -> TokenType dictionary.
 * @param {object} options - chevrotain createToken options (must include `name`).
 * @returns {object} The newly created TokenType.
 */
function createToken(options) {
  const tokenType = createTokenOrg(options);
  tokensArray.push(tokenType);
  tokensDictionary[options.name] = tokenType;
  return tokenType;
}
// NameStartChar (simplified from the XML 1.0 spec): ASCII letters plus
// several unicode ranges. Each range is wrapped in a character class —
// previously e.g. "\u2070-\u218F" stood alone as an alternative, which
// matches the literal three-character sequence "⁰-↏" rather than one
// character in the range.
// NOTE(review): the XML spec also allows "_" and ":" in NameStartChar;
// they are absent here — confirm whether that is intentional.
FRAGMENT(
  "NameStartChar",
  "([a-zA-Z]|[\\u2070-\\u218F]|[\\u2C00-\\u2FEF]|[\\u3001-\\uD7FF]|[\\uF900-\\uFDCF]|[\\uFDF0-\\uFFFD])",
);
// NameChar: NameStartChar plus "-", "_", ".", digits, middle dot, and
// combining/undertie ranges. Fix: an empty alternative ("||") made
// NameChar match the empty string.
FRAGMENT(
  "NameChar",
  makePattern`${f.NameStartChar}|-|_|\\.|\\d|\\u00B7|[\\u0300-\\u036F]|[\\u203F-\\u2040]`,
);
// Name: a NameStartChar followed by any number of NameChars.
FRAGMENT("Name", makePattern`${f.NameStartChar}(${f.NameChar})*`);
// XML comment, lazily matched up to the first "-->".
// Fix: the pattern used ".", which does not match "\n", so multi-line
// comments could never be tokenized despite line_breaks being set.
// "[\s\S]" matches any character including line terminators.
const Comment = createToken({
  name: "Comment",
  pattern: /<!--[\s\S]*?-->/,
  // A Comment may span multiple lines.
  line_breaks: true,
});
// CDATA section, lazily matched up to the first "]]>"; may also span lines.
const CData = createToken({
  name: "CData",
  pattern: /<!\[CDATA\[[\s\S]*?]]>/,
  line_breaks: true,
});
// DOCTYPE and other "<!...>" declarations are lexed but discarded.
const DTD = createToken({
  name: "DTD",
  pattern: /<![\s\S]*?>/,
  line_breaks: true,
  group: Lexer.SKIPPED,
});
// General entity reference, e.g. "&amp;" or "&myEntity;".
const EntityRef = createToken({
  name: "EntityRef",
  pattern: makePattern`&${f.Name};`,
});
// Character reference: decimal ("&#65;") or hexadecimal ("&#x41;").
// Fix: the hex branch was missing its "+" quantifier and the closing ";"
// ("&#x[a-fA-F0-9]"), so hex references could not be fully matched.
const CharRef = createToken({
  name: "CharRef",
  pattern: /&#\d+;|&#x[a-fA-F0-9]+;/,
});
// Runs of whitespace appearing between elements ("sea of whitespace").
const SEA_WS = createToken({
  name: "SEA_WS",
  pattern: /( |\t|\n|\r\n)+/,
});
// "<?xml " followed by whitespace — opens the XML declaration and
// switches the lexer into tag-interior (INSIDE) mode.
const XMLDeclOpen = createToken({
  name: "XMLDeclOpen",
  pattern: /<\?xml[ \t\r\n]/,
  push_mode: "INSIDE",
});
// "</" — start of a closing tag; also enters INSIDE mode.
const SLASH_OPEN = createToken({
  name: "SLASH_OPEN",
  pattern: /<\//,
  push_mode: "INSIDE",
});
// "<" — start of an opening tag; enters INSIDE mode.
const OPEN = createToken({ name: "OPEN", pattern: /</, push_mode: "INSIDE" });
// Processing instruction, e.g. "<?target ... ?>".
// NOTE(review): the greedy ".*" will run to the LAST "?>" if a line holds
// several PIs — a lazy ".*?" may be intended; confirm against the parser.
const PROCESSING_INSTRUCTION = createToken({
  name: "PROCESSING_INSTRUCTION",
  pattern: makePattern`<\\?${f.Name}.*\\?>`,
});
// Character data between markup: anything that is not "<" or "&".
const TEXT = createToken({ name: "TEXT", pattern: /[^<&]+/ });
// ">" — ends a tag; pops back to the previous (OUTSIDE) mode.
const CLOSE = createToken({ name: "CLOSE", pattern: />/, pop_mode: true });
// "?>" — ends the XML declaration; pops back to OUTSIDE mode.
const SPECIAL_CLOSE = createToken({
  name: "SPECIAL_CLOSE",
  pattern: /\?>/,
  pop_mode: true,
});
// "/>" — ends a self-closing tag; pops back to OUTSIDE mode.
const SLASH_CLOSE = createToken({
  name: "SLASH_CLOSE",
  pattern: /\/>/,
  pop_mode: true,
});
// "/" inside a tag (e.g. between the name and ">").
const SLASH = createToken({ name: "SLASH", pattern: /\// });
// Attribute value: double- or single-quoted; "<" is not allowed inside.
const STRING = createToken({
  name: "STRING",
  pattern: /"[^<"]*"|'[^<']*'/,
});
// "=" between an attribute name and its value.
const EQUALS = createToken({ name: "EQUALS", pattern: /=/ });
// Element or attribute name, built from the Name fragment above.
const Name = createToken({ name: "Name", pattern: makePattern`${f.Name}` });
// Whitespace inside a tag is lexed but skipped.
const S = createToken({
  name: "S",
  pattern: /[ \t\r\n]/,
  group: Lexer.SKIPPED,
});
// Multi-mode lexer definition:
//  - OUTSIDE: element content / top level of the document.
//  - INSIDE:  within a tag, entered by OPEN / SLASH_OPEN / XMLDeclOpen and
//             exited by the CLOSE / SPECIAL_CLOSE / SLASH_CLOSE tokens.
// Token order within each mode array is significant: chevrotain tries
// patterns in order and the first match wins.
const xmlLexerDefinition = {
  defaultMode: "OUTSIDE",
  modes: {
    OUTSIDE: [
      Comment,
      CData,
      DTD,
      EntityRef,
      CharRef,
      SEA_WS,
      XMLDeclOpen,
      SLASH_OPEN,
      OPEN,
      // NOTE(review): OPEN (/</) precedes PROCESSING_INSTRUCTION, so a
      // "<?target ...?>" PI appears to lex as OPEN first — confirm whether
      // PIs are meant to be assembled by the parser instead.
      PROCESSING_INSTRUCTION,
      TEXT,
    ],
    INSIDE: [CLOSE, SPECIAL_CLOSE, SLASH_CLOSE, SLASH, EQUALS, STRING, Name, S],
  },
};
// The exported XML lexer instance.
// NOTE(review): ensureOptimizations is off — presumably some of the unicode
// alternation patterns defeat chevrotain's first-char optimization; confirm.
export const xmlLexer = new Lexer(xmlLexerDefinition, {
  // Reducing the amount of position tracking can provide a small performance boost (<10%)
  // Likely best to keep the full info for better error position reporting and
  // to expose "fuller" ITokens from the Lexer.
  positionTracking: "full",
  ensureOptimizations: false,
  // TODO: inspect definitions for XML line terminators
  lineTerminatorCharacters: ["\n"],
  lineTerminatorsPattern: /\n|\r\n/g,
});