From 72b2c8b9301aa705aa883cb6ebd296365798371c Mon Sep 17 00:00:00 2001
From: Shahar Soel
Date: Mon, 19 Dec 2016 01:01:56 +0200
Subject: [PATCH] Support Lexing with custom (non-RegExp) Token Patterns.

fixes #331
---
 docs/custom_token_patterns.md                      | 76 ++++++++++++++++++
 examples/lexer/README.md                           |  1 +
 .../lexer/custom_patterns/custom_patterns.js       | 62 ++++++++++++++
 .../custom_patterns/custom_patterns_spec.js        | 23 ++++++
 readme.md                                          |  1 +
 src/scan/lexer.ts                                  | 80 +++++++++++++------
 src/scan/lexer_public.ts                           | 43 +++++++---
 src/scan/tokens_public.ts                          | 38 +++++++--
 src/utils/utils.ts                                 |  5 +-
 test/scan/lexer_spec.ts                            | 48 ++++++++++-
 10 files changed, 335 insertions(+), 42 deletions(-)
 create mode 100644 docs/custom_token_patterns.md
 create mode 100644 examples/lexer/custom_patterns/custom_patterns.js
 create mode 100644 examples/lexer/custom_patterns/custom_patterns_spec.js

diff --git a/docs/custom_token_patterns.md b/docs/custom_token_patterns.md
new file mode 100644
index 000000000..dba0c98e9
--- /dev/null
+++ b/docs/custom_token_patterns.md
@@ -0,0 +1,76 @@
+## Custom Token Patterns
+
+See: [**Runnable example**](../examples/lexer/custom_patterns/custom_patterns.js) for a quick start.
+
+### Background
+Normally a Token's pattern is defined using a JavaScript regular expression:
+
+```JavaScript
+let IntegerToken = createToken({name: "IntegerToken", pattern: /\d+/})
+```
+
+However, in some circumstances a custom pattern matching implementation may be required:
+perhaps a special Token that cannot easily be defined using regular expressions, or perhaps
+a workaround for performance problems in a specific regular expression engine, for example:
+
+* [WebKit/Safari multiple orders of magnitude performance degradation for specific regExp patterns](https://bugs.webkit.org/show_bug.cgi?id=152578) 😞
+
+
+### Usage
+A custom pattern must conform to the API of the [RegExp.prototype.exec](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/exec)
+function. Additionally, it must perform any matches from the **start** of the input. In RegExp semantics this means
+that any custom pattern implementation should behave as if the [start of input anchor](http://www.rexegg.com/regex-anchors.html#caret)
+had been used.
+
+
+The basic syntax for supplying a custom pattern is defined by the [ICustomPattern](http://sap.github.io/chevrotain/documentation/0_20_0/interfaces/icustompattern.html) interface.
+Example:
+
+```JavaScript
+function matchInteger(text) {
+    let i = 0
+    let charCode = text.charCodeAt(i)
+    while (charCode >= 48 && charCode <= 57) {
+        i++
+        charCode = text.charCodeAt(i)
+    }
+
+    // No match, must return null to conform with the RegExp.prototype.exec signature
+    if (i === 0) {
+        return null
+    }
+    else {
+        let matchedString = text.substring(0, i)
+        // According to the RegExp.prototype.exec API, the first item in the returned array must be the whole matched string.
+        return [matchedString]
+    }
+}
+
+let IntegerToken = createToken({
+    name: "IntegerToken",
+    pattern: {
+        exec: matchInteger,
+        containsLineTerminator: false
+    }})
+```
+
+The **containsLineTerminator** property is used by the lexer to correctly compute line/column numbers.
+If the custom pattern could possibly match a line terminator then this property must be set to "true".
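+For example, a whitespace Token that may span multiple lines would have to declare it.
+A minimal sketch (the `matchMultiLineWs` helper below is only illustrative, it is not part of the library):
+
+```JavaScript
+// Matches one or more whitespace characters (possibly including "\n" / "\r")
+// from the start of the input, conforming to the RegExp.prototype.exec API.
+function matchMultiLineWs(text) {
+    return /^\s+/.exec(text)
+}
+
+let WhitespaceToken = createToken({
+    name: "WhitespaceToken",
+    pattern: {
+        exec: matchMultiLineWs,
+        // The matched text may include line terminators, so the lexer must
+        // inspect it when computing line/column positions.
+        containsLineTerminator: true
+    }})
+```
+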
+Most Tokens can never contain a line terminator, so the property is optional (false by default), which enables a shorter syntax:
+
+```JavaScript
+let IntegerToken = createToken({
+    name: "IntegerToken",
+    pattern: {
+        exec: matchInteger
+    }})
+```
+
+Using an Object literal with only a single property is still a little verbose, so an even more concise syntax is also supported:
+```JavaScript
+let IntegerToken = createToken({name: "IntegerToken", pattern: matchInteger})
+```
diff --git a/examples/lexer/README.md b/examples/lexer/README.md
index 48289580a..c12ad5426 100644
--- a/examples/lexer/README.md
+++ b/examples/lexer/README.md
@@ -6,6 +6,7 @@ A few simple examples of using the Chevrotain Lexer to resolve some common lexin
 * [Keywords vs Identifiers](https://github.com/SAP/Chevrotain/blob/master/examples/lexer/keywords_vs_identifiers/keywords_vs_identifiers.js)
 * [Token Groups](https://github.com/SAP/Chevrotain/blob/master/examples/lexer/token_groups/token_groups.js)
 * [Lexer with Multiple Modes](https://github.com/SAP/Chevrotain/blob/master/examples/lexer/multi_mode_lexer/multi_mode_lexer.js)
+* [Custom Token Pattern implementations](https://github.com/SAP/Chevrotain/blob/master/examples/lexer/custom_patterns/custom_patterns.js)
 
 to run all the lexer examples's tests:
diff --git a/examples/lexer/custom_patterns/custom_patterns.js b/examples/lexer/custom_patterns/custom_patterns.js
new file mode 100644
index 000000000..e65ee94fd
--- /dev/null
+++ b/examples/lexer/custom_patterns/custom_patterns.js
@@ -0,0 +1,62 @@
+/**
+ * This example demonstrates the usage of custom token patterns.
+ * Custom token patterns allow implementing token matchers using arbitrary JavaScript code
+ * instead of being limited to only using regular expressions.
+ *
+ * For additional details see the docs:
+ * https://github.com/SAP/chevrotain/blob/master/docs/custom_token_patterns.md
+ */
+let chevrotain = require("chevrotain")
+let createToken = chevrotain.createToken
+let Lexer = chevrotain.Lexer
+
+
+// First, let's define our custom pattern for matching an Integer Literal.
+// This function's signature matches the RegExp.prototype.exec function.
+// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/exec
+function matchInteger(text) {
+    let i = 0
+    let charCode = text.charCodeAt(i)
+    while (charCode >= 48 && charCode <= 57) {
+        i++
+        charCode = text.charCodeAt(i)
+    }
+
+    // No match, must return null to conform with the RegExp.prototype.exec signature
+    if (i === 0) {
+        return null
+    }
+    else {
+        let matchedString = text.substring(0, i)
+        // According to the RegExp.prototype.exec API, the first item in the returned array must be the whole matched string.
+        return [matchedString]
+    }
+}
+
+// Now we can simply replace the regExp pattern with our custom pattern.
+// Consult the docs (linked above) for additional syntax variants.
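+// For reference, a sketch of the equivalent (more verbose) object literal variant
+// described in the docs would be:
+// let IntegerLiteral = createToken({name: "IntegerLiteral", pattern: {exec: matchInteger}})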
+let IntegerLiteral = createToken({name: "IntegerLiteral", pattern: matchInteger})
+let Comma = createToken({name: "Comma", pattern: /,/})
+let Whitespace = createToken({name: "Whitespace", pattern: /\s+/, group: Lexer.SKIPPED})
+
+let customPatternLexer = new Lexer(
+    [
+        Whitespace,
+        Comma,
+        IntegerLiteral
+    ])
+
+module.exports = {
+
+    IntegerLiteral: IntegerLiteral,
+    Comma: Comma,
+
+    tokenize: function(text) {
+        let lexResult = customPatternLexer.tokenize(text)
+
+        if (lexResult.errors.length >= 1) {
+            throw new Error("sad sad panda lexing errors detected")
+        }
+        return lexResult
+    }
+}
diff --git a/examples/lexer/custom_patterns/custom_patterns_spec.js b/examples/lexer/custom_patterns/custom_patterns_spec.js
new file mode 100644
index 000000000..8be549519
--- /dev/null
+++ b/examples/lexer/custom_patterns/custom_patterns_spec.js
@@ -0,0 +1,23 @@
+let assert = require("assert")
+let expect = require("chai").expect
+let customPatternExample = require("./custom_patterns")
+
+let tokenize = customPatternExample.tokenize
+let Comma = customPatternExample.Comma
+let IntegerLiteral = customPatternExample.IntegerLiteral
+
+describe("The Chevrotain Lexer's ability to use custom pattern implementations.", () => {
+
+    it("Can Lex a simple input using a custom Integer Literal pattern", () => {
+        let text = `1 , 2 , 3`
+        let lexResult = tokenize(text)
+
+        assert.equal(lexResult.errors.length, 0)
+        assert.equal(lexResult.tokens.length, 5)
+
+        expect(lexResult.tokens[0]).to.be.an.instanceof(IntegerLiteral)
+        expect(lexResult.tokens[1]).to.be.an.instanceof(Comma)
+        expect(lexResult.tokens[2]).to.be.an.instanceof(IntegerLiteral)
+        expect(lexResult.tokens[3]).to.be.an.instanceof(Comma)
+        expect(lexResult.tokens[4]).to.be.an.instanceof(IntegerLiteral)
+    })
+})
diff --git a/readme.md b/readme.md
index 1c0ff1d62..ff90b58f1 100644
--- a/readme.md
+++ b/readme.md
@@ -38,6 +38,7 @@ any code generation phase.
    * [Multiple Lexer Modes][lexer_modes] depending on the context.
    * [Tokens Grouping][lexer_groups].
    * [Different Token types for balancing performance, memory usage and ease of use](docs/token_types.md).
+   * [Custom Token patterns (non-RegExp) support](docs/custom_token_patterns.md)
    * **No code generation** The Lexer does not require any code generation phase.
 3. [**High Performance**][benchmark].
diff --git a/src/scan/lexer.ts b/src/scan/lexer.ts
index d95d91c02..28ab0fec1 100644
--- a/src/scan/lexer.ts
+++ b/src/scan/lexer.ts
@@ -1,10 +1,12 @@
-import {Token, tokenName, ISimpleTokenOrIToken} from "./tokens_public"
-import {TokenConstructor, ILexerDefinitionError, LexerDefinitionErrorType, Lexer, IMultiModeLexerDefinition} from "./lexer_public"
+import {Token, tokenName, ISimpleTokenOrIToken, CustomPatternMatcherFunc} from "./tokens_public"
+import {
+    TokenConstructor, ILexerDefinitionError, LexerDefinitionErrorType, Lexer, IMultiModeLexerDefinition,
+    IRegExpExec
+} from "./lexer_public"
 import {
     reject,
     indexOf,
     map,
-    zipObject,
     isString,
     isUndefined,
     reduce,
@@ -19,7 +21,8 @@ import {
     uniq,
     every,
     keys,
-    isArray
+    isArray,
+    isFunction
 } from "../utils/utils"
 import {isLazyTokenType, isSimpleTokenType} from "./tokens"
 
@@ -28,7 +31,7 @@ export const DEFAULT_MODE = "defaultMode"
 export const MODES = "modes"
 
 export interface IAnalyzeResult {
-    allPatterns:RegExp[]
+    allPatterns:IRegExpExec[]
     patternIdxToClass:Function[]
     patternIdxToGroup:any[]
    patternIdxToLongerAltIdx:number[]
@@ -38,6 +41,8 @@ export interface IAnalyzeResult {
     emptyGroups:{ [groupName:string]:Token[] }
 }
 
+const CONTAINS_LINE_TERMINATOR = "containsLineTerminator"
+
 export function analyzeTokenClasses(tokenClasses:TokenConstructor[]):IAnalyzeResult {
 
     let onlyRelevantClasses = reject(tokenClasses, (currClass) => {
@@ -45,15 +50,27 @@ export function analyzeTokenClasses(tokenClasses:TokenConstructor[]):IAnalyzeRes
     })
 
     let allTransformedPatterns = map(onlyRelevantClasses, (currClass) => {
-        return addStartOfInput(currClass[PATTERN])
-    })
+        let currPattern = currClass[PATTERN]
 
-    let allPatternsToClass = zipObject(allTransformedPatterns, onlyRelevantClasses)
+        if (isRegExp(currPattern)) {
+            return addStartOfInput(currPattern)
+        }
+        // CustomPatternMatcherFunc - custom patterns do not require any transformation, only wrapping in a RegExp-like object
+        else if (isFunction(currPattern)) {
+            return {exec: currPattern}
+        }
+        // ICustomPattern
+        else if (has(currPattern, "exec")) {
+            return currPattern
+        }
+        else {
+            throw Error("non exhaustive match")
+        }
+    })
 
-    let patternIdxToClass:any = map(allTransformedPatterns, (pattern) => {
-        return allPatternsToClass[pattern.toString()]
-    })
+    let patternIdxToClass = onlyRelevantClasses
 
     let patternIdxToGroup = map(onlyRelevantClasses, (clazz:any) => {
         let groupName = clazz.GROUP
         if (groupName === Lexer.SKIPPED) {
@@ -84,8 +101,16 @@ export function analyzeTokenClasses(tokenClasses:TokenConstructor[]):IAnalyzeRes
     let patternIdxToPopMode = map(onlyRelevantClasses, (clazz:any) => has(clazz, "POP_MODE"))
 
     let patternIdxToCanLineTerminator = map(allTransformedPatterns, (pattern:RegExp) => {
-        // TODO: unicode escapes of line terminators too?
-        return /\\n|\\r|\\s/g.test(pattern.source)
+        if (isRegExp(pattern)) {
+            // TODO: unicode escapes of line terminators too?
+            return /\\n|\\r|\\s/g.test(pattern.source)
+        }
+        else {
+            if (has(pattern, CONTAINS_LINE_TERMINATOR)) {
+                return pattern[CONTAINS_LINE_TERMINATOR]
+            }
+            return false
+        }
     })
 
     let emptyGroups = reduce(onlyRelevantClasses, (acc, clazz:any) => {
@@ -112,18 +137,13 @@ export function validatePatterns(tokenClasses:TokenConstructor[], validModesName
     let errors = []
 
     let missingResult = findMissingPatterns(tokenClasses)
-    let validTokenClasses = missingResult.valid
     errors = errors.concat(missingResult.errors)
 
-    let invalidResult = findInvalidPatterns(validTokenClasses)
-    validTokenClasses = invalidResult.valid
+    let invalidResult = findInvalidPatterns(missingResult.valid)
+    let validTokenClasses = invalidResult.valid
     errors = errors.concat(invalidResult.errors)
 
-    errors = errors.concat(findEndOfInputAnchor(validTokenClasses))
-
-    errors = errors.concat(findUnsupportedFlags(validTokenClasses))
-
-    errors = errors.concat(findDuplicatePatterns(validTokenClasses))
+    errors = errors.concat(validateRegExpPattern(validTokenClasses))
 
     errors = errors.concat(findInvalidGroupType(validTokenClasses))
 
     return errors
 }
 
+function validateRegExpPattern(tokenClasses:TokenConstructor[]):ILexerDefinitionError[] {
+    let errors = []
+    let withRegExpPatterns = filter(tokenClasses, (currTokClass) => isRegExp(currTokClass[PATTERN]))
+
+    errors = errors.concat(findEndOfInputAnchor(withRegExpPatterns))
+
+    errors = errors.concat(findUnsupportedFlags(withRegExpPatterns))
+
+    errors = errors.concat(findDuplicatePatterns(withRegExpPatterns))
+
+    return errors
+}
+
 export interface ILexerFilterResult {
     errors:ILexerDefinitionError[]
     valid:TokenConstructor[]
 }
@@ -157,12 +190,13 @@ export function findMissingPatterns(tokenClasses:TokenConstructor[]):ILexerFilte
 export function findInvalidPatterns(tokenClasses:TokenConstructor[]):ILexerFilterResult {
     let tokenClassesWithInvalidPattern = filter(tokenClasses, (currClass) => {
         let pattern = currClass[PATTERN]
-        return !isRegExp(pattern)
+        return !isRegExp(pattern) && !isFunction(pattern) && !has(pattern, "exec")
     })
 
     let errors = map(tokenClassesWithInvalidPattern, (currClass) => {
         return {
-            message: "Token class: ->" + tokenName(currClass) + "<- static 'PATTERN' can only be a RegExp",
+            message: "Token class: ->" + tokenName(currClass) + "<- static 'PATTERN' can only be a RegExp, a" +
+            " Function matching the {CustomPatternMatcherFunc} type or an Object matching the {ICustomPattern} interface.",
             type: LexerDefinitionErrorType.INVALID_PATTERN,
             tokenClasses: [currClass]
         }
@@ -361,8 +395,6 @@ export function performRuntimeChecks(lexerDefinition:IMultiModeLexerDefinition):
             })
         }
     })
-
-    // lexerDefinition.modes[currModeName] = reject(currModeValue, (currTokClass) => isUndefined(currTokClass))
     })
 }
diff --git a/src/scan/lexer_public.ts b/src/scan/lexer_public.ts
index cf9eb3957..1c0043a32 100644
--- a/src/scan/lexer_public.ts
+++ b/src/scan/lexer_public.ts
@@ -1,7 +1,21 @@
-import {Token, LazyTokenCacheData, getImage, getStartLine, getStartColumn, ISimpleTokenOrIToken} from "./tokens_public"
 import {
-    validatePatterns, analyzeTokenClasses, countLineTerminators, DEFAULT_MODE, performRuntimeChecks, checkLazyMode,
-    checkSimpleMode, cloneEmptyGroups
+    Token,
+    LazyTokenCacheData,
+    getImage,
+    getStartLine,
+    getStartColumn,
+    ISimpleTokenOrIToken,
+    CustomPatternMatcherFunc
+} from "./tokens_public"
+import {
+    validatePatterns,
+    analyzeTokenClasses,
+    countLineTerminators,
+    DEFAULT_MODE,
+    performRuntimeChecks,
+    checkLazyMode,
+    checkSimpleMode,
+    cloneEmptyGroups
 } from "./lexer"
 import {
     cloneObj,
@@ -19,8 +33,13 @@ import {
     mapValues
 } from "../utils/utils"
 import {
-    fillUpLineToOffset, getStartColumnFromLineToOffset, getStartLineFromLineToOffset, augmentTokenClasses,
-    createSimpleLazyToken, LazyTokenCreator, createLazyTokenInstance
+    fillUpLineToOffset,
+    getStartColumnFromLineToOffset,
+    getStartLineFromLineToOffset,
+    augmentTokenClasses,
+    createSimpleLazyToken,
+    LazyTokenCreator,
+    createLazyTokenInstance
 } from "./tokens"
 
 export interface TokenConstructor extends Function {
@@ -80,6 +99,10 @@ export interface IMultiModeLexerDefinition {
     defaultMode:string
 }
 
+export interface IRegExpExec {
+    exec:CustomPatternMatcherFunc
+}
+
 export class Lexer {
 
     public static SKIPPED = "This marks a skipped Token pattern, this means each token identified by it will" +
@@ -92,7 +115,7 @@ export class Lexer {
     protected isSimpleTokenMode
     protected modes:string[] = []
     protected defaultMode:string
-    protected allPatterns:{ [modeName:string]:RegExp[] } = {}
+    protected allPatterns:{ [modeName:string]:IRegExpExec[] } = {}
     protected patternIdxToClass:{ [modeName:string]:Function[] } = {}
     protected patternIdxToGroup:{ [modeName:string]:string[] } = {}
     protected patternIdxToLongerAltIdx:{ [modeName:string]:number[] } = {}
@@ -472,8 +495,8 @@ export class Lexer {
                 text = text.substr(1)
                 offset++
                 for (j = 0; j < currModePatterns.length; j++) {
-                    foundResyncPoint = currModePatterns[j].test(text)
-                    if (foundResyncPoint) {
+                    foundResyncPoint = currModePatterns[j].exec(text)
+                    if (foundResyncPoint !== null) {
                         break
                     }
                 }
@@ -609,8 +632,8 @@ export class Lexer {
                 text = text.substr(1)
                 offset++
                 for (j = 0; j < currModePatterns.length; j++) {
-                    foundResyncPoint = currModePatterns[j].test(text)
-                    if (foundResyncPoint) {
+                    foundResyncPoint = currModePatterns[j].exec(text)
+                    if (foundResyncPoint !== null) {
                         break
                     }
                 }
diff --git a/src/scan/tokens_public.ts b/src/scan/tokens_public.ts
index 11cc57373..1e322101d 100644
--- a/src/scan/tokens_public.ts
+++ b/src/scan/tokens_public.ts
@@ -1,6 +1,6 @@
 import {isString, isRegExp, isFunction, isUndefined, assignNoOverwrite, has} from "../utils/utils"
 import {functionName, defineNameProp} from "../lang/lang_extensions"
-import {Lexer, TokenConstructor} from "./lexer_public"
+import {Lexer, TokenConstructor, IRegExpExec} from "./lexer_public"
 import {
     isInheritanceBasedToken,
     getStartLineFromLazyToken,
@@ -14,11 +14,39 @@ import {
     augmentTokenClasses
 } from "./tokens"
 
+/**
+ * The type of custom pattern matcher functions.
+ * Matches should only be done from the start of the text.
+ * Note that this is identical to the signature of RegExp.prototype.exec.
+ *
+ * An implementation should behave as if the RegExp match had used a start of input anchor.
+ * So, for example, a custom matcher for Tokens matching /\w+/
+ * must be implemented as if the pattern were /^\w+/.
+ */
+export type CustomPatternMatcherFunc = (text:string) => RegExpExecArray
+
+/**
+ * Interface for custom user provided token pattern matchers.
+ */
+export interface ICustomPattern {
+    /**
+     * The custom pattern implementation.
+     * @see CustomPatternMatcherFunc
+     */
+    exec:CustomPatternMatcherFunc
+    /**
+     * Flag indicating whether this custom pattern may match line terminators.
+     * This is required to avoid errors in the line/column numbering.
+     * @default false - if this property was not explicitly defined.
+     */
+    containsLineTerminator?:boolean
+}
+
 /**
  * This can be used to improve the quality/readability of error messages or syntax diagrams.
  *
  * @param {Function} clazz - A constructor for a Token subclass
- * @returns {string} - The Human readable label a Token if it exists.
+ * @returns {string} - The Human readable label for a Token if it exists.
  */
 export function tokenLabel(clazz:Function):string {
     if (hasTokenLabel(clazz)) {
@@ -47,12 +75,11 @@ export function tokenName(clazz:Function):string {
     }
 }
 
-// TODO: uppper or lower case name? or support both???
 export interface ITokenConfig {
     name:string
     parent?:TokenConstructor
     label?:string
-    pattern?:RegExp
+    pattern?:RegExp | CustomPatternMatcherFunc | ICustomPattern
     group?:string|any
     push_mode?:string
     pop_mode?:boolean
@@ -67,7 +94,6 @@ const POP_MODE = "pop_mode"
 const LONGER_ALT = "longer_alt"
 
 /**
- *
  * @param {ITokenConfig} config - The configuration for
  * @returns {TokenConstructor} - A constructor for the new Token subclass
  */
@@ -128,7 +154,7 @@ export function extendSimpleLazyToken(tokenName:string, patternOrParent:any = un
  * extend and create Token subclasses in a less verbose manner
  *
  * @param {string} tokenName - The name of the new TokenClass
- * @param {RegExp|Function} patternOrParent - RegExp Pattern or Parent Token Constructor
+ * @param {RegExp|CustomPatternMatcherFunc|Function} patternOrParent - RegExp Pattern or Parent Token Constructor
 * @param {Function} parentConstructor - The Token class to be extended
 * @returns {Function} - A constructor for the new extended Token subclass
  */
diff --git a/src/utils/utils.ts b/src/utils/utils.ts
index 7f6b61431..972cb76af 100644
--- a/src/utils/utils.ts
+++ b/src/utils/utils.ts
@@ -138,7 +138,10 @@ export function pick(obj:Object, predicate:(item) => boolean) {
 }
 
 export function has(obj:any, prop:string):boolean {
-    return obj.hasOwnProperty(prop)
+    if (isObject(obj)) {
+        return obj.hasOwnProperty(prop)
+    }
+    return false
 }
 
 export function contains<T>(arr:T[], item):boolean {
diff --git a/test/scan/lexer_spec.ts b/test/scan/lexer_spec.ts
index 312633782..4d6ffecb5 100644
--- a/test/scan/lexer_spec.ts
+++ b/test/scan/lexer_spec.ts
@@ -11,7 +11,8 @@ import {
     getStartColumn,
     getStartLine,
     getEndLine,
-    getEndColumn, SimpleLazyToken
+    getEndColumn,
+    SimpleLazyToken, createToken
 } from "../../src/scan/tokens_public"
 import {Lexer, LexerDefinitionErrorType, IMultiModeLexerDefinition} from "../../src/scan/lexer_public"
 import {
@@ -36,6 +37,7 @@ function defineLexerSpecs(contextName, extendToken, tokenMatcher) {
 
     const IntegerTok = extendToken("IntegerTok", /^[1-9]\d*/)
     const IdentifierTok = extendToken("IdentifierTok", /^[A-Za-z_]\w*/)
     const BambaTok = extendToken("BambaTok", /^bamba/)
+
     BambaTok.LONGER_ALT = IdentifierTok
 
@@ -910,6 +912,50 @@ function defineLexerSpecs(contextName, extendToken, tokenMatcher) {
             expect(badLexer.lexerDefinitionErrors[0].message).to.include("NotSimpleTok1")
             expect(badLexer.lexerDefinitionErrors[0].message).to.include("NotSimpleTok2")
         })
+
+        context("custom pattern", () => {
+
+            function defineCustomPatternSpec(variant, customPattern) {
+                it(variant, () => {
+                    let A = createToken({name: "A", pattern: /A/})
+                    let B = createToken({name: "B", pattern: customPattern})
+                    let WS = createToken({
+                        name: "WS", pattern: {
+                            exec: (text) => /^\s+/.exec(text),
+                            containsLineTerminator: true
+                        }, group: Lexer.SKIPPED
+                    })
+
+                    let lexerDef:any = [WS, A, B]
+
+                    let myLexer = new Lexer(lexerDef)
+                    let lexResult = myLexer.tokenize("B A\n B ")
+
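+                    // Input "B A\n B " should lex into [B, A, B]; the whitespace is
+                    // skipped, and because the custom WS pattern declared
+                    // containsLineTerminator, the second B is reported on line 2.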
+                    expect(lexResult.tokens).to.have.length(3)
+                    expect(lexResult.tokens[0]).to.be.instanceOf(B)
+                    expect(lexResult.tokens[1]).to.be.instanceOf(A)
+                    expect(lexResult.tokens[2]).to.be.instanceOf(B)
+
+                    let lastToken = lexResult.tokens[2]
+                    expect(getStartLine(lastToken)).to.equal(2)
+                    expect(getEndLine(lastToken)).to.equal(2)
+                    expect(getStartColumn(lastToken)).to.equal(2)
+                    expect(getEndColumn(lastToken)).to.equal(2)
+                    expect(getStartOffset(lastToken)).to.equal(5)
+                    expect(getEndOffset(lastToken)).to.equal(5)
+                })
+            }
+
+            defineCustomPatternSpec("with short function syntax", (text) => /^B/.exec(text))
+            defineCustomPatternSpec("with explicit containsLineTerminator", {
+                exec: (text) => /^B/.exec(text),
+                containsLineTerminator: false
+            })
+            defineCustomPatternSpec("with implicit containsLineTerminator", {
+                exec: (text) => /^B/.exec(text)
+            })
+        })
     })
 })
})