/* Tokenizers.js */
// Tokenizers: a small combinator library for longest-match, incremental
// lexing.  Calling Tokenizers() builds and returns the API object.  A
// "tokenizer" maps a string to either null (no match) or a two-element
// array [ value, remaining_string ].  Tokenizers are branded with a
// newtype wrapper so arbitrary functions cannot be passed in by accident.
// (Declared with `var` so the file also works under strict mode / ES
// modules; for a top-level script this still creates the same global.)
var Tokenizers = function() {
    // Invoke body(key, value) for every own enumerable property of object.
    var for_kv = function(object, body) {
        for (var k in object) {
            if (object.hasOwnProperty(k)) {
                body(k, object[k]);
            }
        }
    };
    // Backslash-escape regexp metacharacters so `text` can be embedded in
    // a pattern and match itself literally.
    var escape_for_regexp = function(text) {
        return text.replace(/[\]\[\\^$*+?{}\.()|]/g, function(x) { return '\\' + x });
    };
    // Create a fresh brand: wrap(x) tags x with a unique identity object,
    // unwrap(x) recovers the value and rejects anything wrapped by a
    // different newtype instance.
    var newtype = function() {
        var ident = {};
        return {
            wrap: function(x) { return { ident: ident, __value: x } },
            unwrap: function(x) {
                if (x.ident === ident) {
                    return x.__value;
                }
                else {
                    // NOTE(review): throws a string rather than an Error;
                    // kept as-is so any existing catch sites still match.
                    throw "Attempt to unwrap non-matching newtype";
                }
            }
        };
    };
    var $$ = {};
    var Tokenizer = newtype();
    // regexp(tokens): `tokens` maps regexp source strings to callbacks.
    // The candidate whose start-anchored pattern yields the longest match
    // wins, and its callback is applied to the RegExp match object.
    $$.regexp = function(tokens) {
        // Hoisted: compile each pattern once at construction time instead
        // of rebuilding every RegExp on each invocation of the tokenizer.
        var compiled = [];
        for_kv(tokens, function(k, v) {
            compiled.push({ re: new RegExp('^' + k), func: v });
        });
        return Tokenizer.wrap(function(str) {
            var bestMatch = null;
            var bestFunc = null;
            for (var i = 0; i < compiled.length; i++) {
                var m = compiled[i].re.exec(str);
                if (m && (!bestMatch || m[0].length > bestMatch[0].length)) {
                    bestMatch = m;
                    bestFunc = compiled[i].func;
                }
            }
            // don't match the whole string in case we are in the middle of typing a token
            // we use \0 to mean "eof" so this will pass.
            if (bestFunc && bestMatch[0].length < str.length) {
                return [bestFunc(bestMatch), str.slice(bestMatch[0].length)];
            }
            else {
                return null;
            }
        });
    };
    // choice(tokenizers): try each tokenizer in order; return the first
    // successful result, or null if none of them match.
    $$.choice = function(tokenizers) {
        return Tokenizer.wrap(function(str) {
            for (var i = 0; i < tokenizers.length; ++i) {
                // BUG FIX: the elements are wrapped Tokenizers, so they
                // must be unwrapped before being called (exactly as
                // `prefix` and `nonempty_choice` below already do);
                // calling the wrapper object itself throws a TypeError.
                var tokresult = Tokenizer.unwrap(tokenizers[i])(str);
                if (tokresult) {
                    return tokresult;
                }
            }
            return null;
        });
    };
    // string(str, func): a tokenizer matching the literal text `str`,
    // yielding func(match) on success.
    $$.string = function(str, func) {
        var d = {};
        d[escape_for_regexp(str)] = func;
        return $$.regexp(d);
    };
    // map(f, tokenizer): like `tokenizer`, but passes a successful result
    // value through f; the remaining string is forwarded unchanged.
    $$.map = function(f, tokenizer) {
        return Tokenizer.wrap(function(str) {
            var tokresult = Tokenizer.unwrap(tokenizer)(str);
            if (tokresult) {
                return [f(tokresult[0]), tokresult[1]];
            }
            else {
                return null;
            }
        });
    };
    // prefix(tokenizers): run the tokenizers in sequence, collecting their
    // values, stopping at the first failure.  Always succeeds, returning
    // however many values were matched plus the leftover string.
    $$.prefix = function(tokenizers) {
        return Tokenizer.wrap(function(str) {
            var ret = [];
            for (var i = 0; i < tokenizers.length; i++) {
                var tokresult = Tokenizer.unwrap(tokenizers[i])(str);
                if (tokresult) {
                    ret.push(tokresult[0]);
                    str = tokresult[1];
                }
                else {
                    return [ ret, str ];
                }
            }
            return [ ret, str ];
        });
    };
    // nonempty_choice(tokenizers): like choice, but a match only counts if
    // it consumed at least one character of input.
    $$.nonempty_choice = function(tokenizers) {
        if (tokenizers.length == 0) { return Tokenizer.wrap(function(str) { return null }) }
        if (tokenizers.length == 1) { return tokenizers[0] }
        return Tokenizer.wrap(function(str) {
            for (var i = 0; i < tokenizers.length; i++) {
                var tokresult = Tokenizer.unwrap(tokenizers[i])(str);
                if (tokresult && tokresult[1].length < str.length) {
                    return tokresult;
                }
            }
            return null; // er.. if they *all* accept the empty string, then... er...
        });
    };
    // run_tokenizer takes a tokenizer and a string and returns either
    // * null, indicating failure, or
    // * a two element array [ value, remaining_string ]
    $$.run_tokenizer = function(tok, str) {
        return Tokenizer.unwrap(tok)(str);
    };
    // marks a non-algebraic tokenizer which needs further abstraction
    $$.HACK_wrap = function(tok) {
        return Tokenizer.wrap(tok);
    };
    return $$;
};