-
Notifications
You must be signed in to change notification settings - Fork 200
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Support Lexing with custom (non-RegExp) Token Patterns.
fixes #331
- Loading branch information
Shahar Soel
committed
Dec 24, 2016
1 parent
55c7db9
commit 72b2c8b
Showing
10 changed files
with
335 additions
and
42 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
## Custom Token Patterns | ||
|
||
See: [**Runnable example**](../examples/lexer/custom_patterns/custom_patterns.js) for a quick start. | ||
|
||
### Background | ||
Normally a Token's pattern is defined using a JavaScript regular expression: | ||
|
||
```JavaScript | ||
let IntegerToken = createToken({name: "IntegerToken", pattern: /\d+/}) | ||
``` | ||
|
||
However, in some circumstances the capability to provide a custom pattern matching implementation may be required. | ||
This may be needed for a special Token which cannot be easily defined using regular expressions, or | ||
to enable working around performance problems in a specific regular expression engine, for example: | ||
|
||
* [WebKit/Safari multiple orders of magnitude performance degradation for specific regExp patterns](https://bugs.webkit.org/show_bug.cgi?id=152578) 😞 | ||
|
||
|
||
### Usage | ||
A custom pattern must conform to the API of the [RegExp.prototype.exec](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/exec) | ||
function. Additionally, it must perform any matches from the **start** of the input. In RegExp semantics this means | ||
that any custom pattern implementation should behave as if the [start of input anchor](http://www.rexegg.com/regex-anchors.html#caret) | ||
has been used. | ||
|
||
|
||
The basic syntax for supplying a custom pattern is defined by the [ICustomPattern](http://sap.github.io/chevrotain/documentation/0_20_0/interfaces/icustompattern.html) interface. | ||
Example: | ||
|
||
```JavaScript | ||
function matchInteger(text) { | ||
let i = 0 | ||
let charCode = text.charCodeAt(i) | ||
while (charCode >= 48 && charCode <= 57) { | ||
i++ | ||
charCode = text.charCodeAt(i) | ||
} | ||
|
||
// No match, must return null to conform with the RegExp.prototype.exec signature | ||
if (i === 0) { | ||
return null | ||
} | ||
else { | ||
let matchedString = text.substring(0, i) | ||
// according to the RegExp.prototype.exec API the first item in the returned array must be the whole matched string. | ||
return [matchedString] | ||
} | ||
} | ||
|
||
let IntegerToken = createToken({ | ||
name: "IntegerToken", | ||
pattern: { | ||
exec: matchInteger, | ||
containsLineTerminator: false | ||
}}) | ||
``` | ||
|
||
The **containsLineTerminator** property is used by the lexer to properly compute the line/column numbers. | ||
If the custom matched pattern could possibly include a line terminator, then this property must be set to `true`. | ||
Most Tokens can never contain a line terminator, so the property is optional (`false` by default), which enables a shorter syntax: | ||
|
||
```JavaScript | ||
let IntegerToken = createToken({ | ||
name: "IntegerToken", | ||
pattern: { | ||
exec: matchInteger | ||
}}) | ||
``` | ||
|
||
Using an Object literal with only a single property is still a little verbose, so an even more concise syntax is also supported, in which the matcher function itself is passed as the pattern: | ||
```JavaScript | ||
let IntegerToken = createToken({name: "IntegerToken", pattern: matchInteger}) | ||
``` | ||
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
/** | ||
 * This example demonstrates the usage of custom token patterns. | ||
 * Custom token patterns allow implementing token matchers using arbitrary JavaScript code | ||
* instead of being limited to only using regular expressions. | ||
* | ||
* For additional details see the docs: | ||
* https://github.com/SAP/chevrotain/blob/master/docs/custom_token_patterns.md | ||
*/ | ||
let chevrotain = require("chevrotain") | ||
let createToken = chevrotain.createToken | ||
let Lexer = chevrotain.Lexer | ||
|
||
|
||
// Custom pattern used for matching an Integer Literal.
// Its signature mirrors the RegExp.prototype.exec function:
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/exec
/**
 * Matches a run of ASCII decimal digits located at the very start of the input.
 *
 * @param {string} text - The input to match against; matching always begins at offset 0.
 * @returns {string[]|null} A one-element array holding the matched digits,
 *   or null when the input does not start with a digit.
 */
function matchInteger(text) {
    // Count the leading char codes that fall in the '0' (48) .. '9' (57) range.
    // Past the end of the input charCodeAt yields NaN, which fails the range test and ends the scan.
    let end = 0
    for (let code = text.charCodeAt(0); code >= 48 && code <= 57; code = text.charCodeAt(end)) {
        end += 1
    }

    // Nothing was consumed: signal "no match" with null, as RegExp.prototype.exec does.
    if (end === 0) {
        return null
    }

    // As with RegExp.prototype.exec, index 0 of the result carries the whole matched string.
    return [text.slice(0, end)]
}
|
||
// Now we can simply replace the regExp pattern with our custom pattern.
// Consult the Docs (linked above) for additional syntax variants.
let IntegerLiteral = createToken({name: "IntegerLiteral", pattern: matchInteger})
let Comma = createToken({name: "Comma", pattern: /,/})
let Whitespace = createToken({name: "Whitespace", pattern: /\s+/, group: Lexer.SKIPPED})

// The lexer must be declared explicitly: a bare assignment would create an
// implicit global and throws a ReferenceError when running in strict mode.
let customPatternLexer = new Lexer(
    [
        Whitespace,
        Comma,
        IntegerLiteral
    ])
|
||
module.exports = { | ||
|
||
IntegerLiteral: IntegerLiteral, | ||
Comma: Comma, | ||
|
||
tokenize: function(text) { | ||
let lexResult = customPatternLexer.tokenize(text) | ||
|
||
if (lexResult.errors.length >= 1) { | ||
throw new Error("sad sad panda lexing errors detected") | ||
} | ||
return lexResult | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
let assert = require("assert") | ||
let customPatternExample = require("./custom_patterns") | ||
|
||
let tokenize = customPatternExample.tokenize | ||
let Comma = customPatternExample.Comma | ||
let IntegerLiteral = customPatternExample.IntegerLiteral | ||
|
||
// Spec for the custom pattern example: lexes a short comma separated list and
// verifies both the number of tokens and the type of each one.
describe('The Chevrotain Lexer ability to use custom pattern implementations.', () => {

    it('Can Lex a simple input using a Custom Integer Literal RegExp', () => {
        let text = `1 , 2 , 3`
        let lexResult = tokenize(text)

        assert.strictEqual(lexResult.errors.length, 0)
        assert.strictEqual(lexResult.tokens.length, 5)

        // Only Node's "assert" module is loaded by this spec, so the instance checks
        // rely on it too rather than on an `expect` function that is never defined here.
        let expectedTypes = [IntegerLiteral, Comma, IntegerLiteral, Comma, IntegerLiteral]
        expectedTypes.forEach((expectedType, idx) => {
            assert.ok(
                lexResult.tokens[idx] instanceof expectedType,
                `unexpected token type at index ${idx}`
            )
        })
    })
})
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.