Support Lexing with custom (non-RegExp) Token Patterns.
fixes #331
Shahar Soel authored and bd82 committed Dec 22, 2016
1 parent 55c7db9 commit d9ef5dc
Showing 6 changed files with 230 additions and 42 deletions.
58 changes: 58 additions & 0 deletions docs/custom_token_patterns.md
@@ -0,0 +1,58 @@
## Custom Token Patterns


### Background
Normally a Token's pattern is defined using a JavaScript regular expression:

```JavaScript
let IntegerToken = createToken({name: "IntegerToken", pattern: /\d+/})
```

However, in some circumstances the capability to provide a custom pattern matching implementation may be required:
perhaps for a special Token which cannot easily be defined using regular expressions, or
to work around performance problems in a specific regular expression engine, for example:

* [WebKit/Safari multiple orders of magnitude performance degradation for specific regExp patterns](https://bugs.webkit.org/show_bug.cgi?id=152578) 😞


### Usage
A custom pattern must conform to the API of the [RegExp.prototype.exec](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/exec)
function. Additionally, it must perform any matches from the **start** of the input. In RegExp semantics this means
that any custom pattern implementation should behave as if the [start of input anchor](http://www.rexegg.com/regex-anchors.html#caret)
had been used.
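
For instance, a matcher honoring this requirement could simply delegate to an explicitly anchored RegExp. This is a minimal sketch for illustration only; the names used here are not part of this change:

```JavaScript
// Illustration only: delegating to an anchored RegExp satisfies both
// requirements above. Matches can only start at the beginning of the text,
// and RegExp.prototype.exec already returns null when there is no match.
const anchoredInteger = /^\d+/

function matchAnchoredInteger(text) {
    return anchoredInteger.exec(text)
}
```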


The basic syntax for supplying a custom pattern is defined by the [ICustomPattern](TODO:LINK) interface.
Example:

```JavaScript

function matchInteger(text) {
    let i = 0
    let charCode = text.charCodeAt(i)
    while (charCode >= 48 && charCode <= 57) {
        i++
        // Advance to the next character, otherwise the loop would never terminate.
        charCode = text.charCodeAt(i)
    }

    // No match, must return null to conform with the RegExp.prototype.exec signature
    if (i === 0) {
        return null
    }
    else {
        let matchedString = text.substring(0, i)
        // According to the RegExp.prototype.exec API the first item in the returned array must be the whole matched string.
        return [matchedString]
    }
}

let IntegerToken = createToken({
    name: "IntegerToken",
    pattern: {
        exec: matchInteger,
        containsLineTerminator: false
    }})
```
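
Once defined, such a token is used like any other token class. A minimal usage sketch (assuming the standard Lexer API; the input string and variable names are illustrative):

```JavaScript
// Illustrative usage: lex an input using the custom pattern token defined above.
let integerLexer = new Lexer([IntegerToken])
let lexResult = integerLexer.tokenize("42")
// lexResult.tokens should contain a single IntegerToken whose image is "42".
```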



80 changes: 56 additions & 24 deletions src/scan/lexer.ts
@@ -1,10 +1,12 @@
import {Token, tokenName, ISimpleTokenOrIToken} from "./tokens_public"
import {TokenConstructor, ILexerDefinitionError, LexerDefinitionErrorType, Lexer, IMultiModeLexerDefinition} from "./lexer_public"
import {Token, tokenName, ISimpleTokenOrIToken, CustomPatternMatcherFunc} from "./tokens_public"
import {
TokenConstructor, ILexerDefinitionError, LexerDefinitionErrorType, Lexer, IMultiModeLexerDefinition,
IRegExpExec
} from "./lexer_public"
import {
reject,
indexOf,
map,
zipObject,
isString,
isUndefined,
reduce,
@@ -19,7 +21,8 @@ import {
uniq,
every,
keys,
isArray
isArray,
isFunction
} from "../utils/utils"
import {isLazyTokenType, isSimpleTokenType} from "./tokens"

@@ -28,7 +31,7 @@ export const DEFAULT_MODE = "defaultMode"
export const MODES = "modes"

export interface IAnalyzeResult {
allPatterns:RegExp[]
allPatterns:IRegExpExec[]
patternIdxToClass:Function[]
patternIdxToGroup:any[]
patternIdxToLongerAltIdx:number[]
@@ -38,22 +41,36 @@ export interface IAnalyzeResult {
emptyGroups:{ [groupName:string]:Token[] }
}

const CONTAINS_LINE_TERMINATOR = "containsLineTerminator"

export function analyzeTokenClasses(tokenClasses:TokenConstructor[]):IAnalyzeResult {

let onlyRelevantClasses = reject(tokenClasses, (currClass) => {
return currClass[PATTERN] === Lexer.NA
})

let allTransformedPatterns = map(onlyRelevantClasses, (currClass) => {
return addStartOfInput(currClass[PATTERN])
})
let currPattern = currClass[PATTERN]

let allPatternsToClass = zipObject(<any>allTransformedPatterns, onlyRelevantClasses)
if (isRegExp(currPattern)) {
return addStartOfInput(currPattern)
}
// CustomPatternMatcherFunc - custom patterns do not require any transformations, only wrapping in a RegExp Like object
else if (isFunction(currPattern)) {
return {exec: currPattern}
}
// ICustomPattern
else if (has(currPattern, "exec")) {
return currPattern
}
else {
throw Error("non exhaustive match")
}

let patternIdxToClass:any = map(allTransformedPatterns, (pattern) => {
return allPatternsToClass[pattern.toString()]
})

let patternIdxToClass = onlyRelevantClasses

let patternIdxToGroup = map(onlyRelevantClasses, (clazz:any) => {
let groupName = clazz.GROUP
if (groupName === Lexer.SKIPPED) {
@@ -84,8 +101,16 @@ export function analyzeTokenClasses(tokenClasses:TokenConstructor[]):IAnalyzeRes
let patternIdxToPopMode = map(onlyRelevantClasses, (clazz:any) => has(clazz, "POP_MODE"))

let patternIdxToCanLineTerminator = map(allTransformedPatterns, (pattern:RegExp) => {
// TODO: unicode escapes of line terminators too?
return /\\n|\\r|\\s/g.test(pattern.source)
if (isRegExp(pattern)) {
// TODO: unicode escapes of line terminators too?
return /\\n|\\r|\\s/g.test(pattern.source)
}
else {
if (has(pattern, CONTAINS_LINE_TERMINATOR)) {
return pattern[CONTAINS_LINE_TERMINATOR]
}
return false
}
})

let emptyGroups = reduce(onlyRelevantClasses, (acc, clazz:any) => {
@@ -112,18 +137,13 @@ export function validatePatterns(tokenClasses:TokenConstructor[], validModesName
let errors = []

let missingResult = findMissingPatterns(tokenClasses)
let validTokenClasses = missingResult.valid
errors = errors.concat(missingResult.errors)

let invalidResult = findInvalidPatterns(validTokenClasses)
validTokenClasses = invalidResult.valid
let invalidResult = findInvalidPatterns(missingResult.valid)
let validTokenClasses = invalidResult.valid
errors = errors.concat(invalidResult.errors)

errors = errors.concat(findEndOfInputAnchor(validTokenClasses))

errors = errors.concat(findUnsupportedFlags(validTokenClasses))

errors = errors.concat(findDuplicatePatterns(validTokenClasses))
errors = errors.concat(validateRegExpPattern(validTokenClasses))

errors = errors.concat(findInvalidGroupType(validTokenClasses))

@@ -132,6 +152,19 @@
return errors
}

function validateRegExpPattern(tokenClasses:TokenConstructor[]):ILexerDefinitionError[] {
let errors = []
let withRegExpPatterns = filter(tokenClasses, (currTokClass) => isRegExp(currTokClass[PATTERN]))

errors = errors.concat(findEndOfInputAnchor(withRegExpPatterns))

errors = errors.concat(findUnsupportedFlags(withRegExpPatterns))

errors = errors.concat(findDuplicatePatterns(withRegExpPatterns))

return errors
}

export interface ILexerFilterResult {
errors:ILexerDefinitionError[]
valid:TokenConstructor[]
@@ -157,12 +190,13 @@ export function findMissingPatterns(tokenClasses:TokenConstructor[]):ILexerFilte
export function findInvalidPatterns(tokenClasses:TokenConstructor[]):ILexerFilterResult {
let tokenClassesWithInvalidPattern = filter(tokenClasses, (currClass) => {
let pattern = currClass[PATTERN]
return !isRegExp(pattern)
return !isRegExp(pattern) && !isFunction(pattern) && !has(pattern, "exec")
})

let errors = map(tokenClassesWithInvalidPattern, (currClass) => {
return {
message: "Token class: ->" + tokenName(currClass) + "<- static 'PATTERN' can only be a RegExp",
message: "Token class: ->" + tokenName(currClass) + "<- static 'PATTERN' can only be a RegExp, a" +
" Function matching the {CustomPatternMatcherFunc} type or an Object matching the {ICustomPattern} interface.",
type: LexerDefinitionErrorType.INVALID_PATTERN,
tokenClasses: [currClass]
}
@@ -361,8 +395,6 @@ export function performRuntimeChecks(lexerDefinition:IMultiModeLexerDefinition):
})
}
})

// lexerDefinition.modes[currModeName] = reject<Function>(currModeValue, (currTokClass) => isUndefined(currTokClass))
})
}

43 changes: 33 additions & 10 deletions src/scan/lexer_public.ts
@@ -1,7 +1,21 @@
import {Token, LazyTokenCacheData, getImage, getStartLine, getStartColumn, ISimpleTokenOrIToken} from "./tokens_public"
import {
validatePatterns, analyzeTokenClasses, countLineTerminators, DEFAULT_MODE, performRuntimeChecks, checkLazyMode,
checkSimpleMode, cloneEmptyGroups
Token,
LazyTokenCacheData,
getImage,
getStartLine,
getStartColumn,
ISimpleTokenOrIToken,
CustomPatternMatcherFunc
} from "./tokens_public"
import {
validatePatterns,
analyzeTokenClasses,
countLineTerminators,
DEFAULT_MODE,
performRuntimeChecks,
checkLazyMode,
checkSimpleMode,
cloneEmptyGroups
} from "./lexer"
import {
cloneObj,
@@ -19,8 +33,13 @@ import {
mapValues
} from "../utils/utils"
import {
fillUpLineToOffset, getStartColumnFromLineToOffset, getStartLineFromLineToOffset, augmentTokenClasses,
createSimpleLazyToken, LazyTokenCreator, createLazyTokenInstance
fillUpLineToOffset,
getStartColumnFromLineToOffset,
getStartLineFromLineToOffset,
augmentTokenClasses,
createSimpleLazyToken,
LazyTokenCreator,
createLazyTokenInstance
} from "./tokens"

export interface TokenConstructor extends Function {
@@ -80,6 +99,10 @@ export interface IMultiModeLexerDefinition {
defaultMode:string
}

export interface IRegExpExec {
exec:CustomPatternMatcherFunc
}

export class Lexer {

public static SKIPPED = "This marks a skipped Token pattern, this means each token identified by it will" +
@@ -92,7 +115,7 @@ export class Lexer {
protected isSimpleTokenMode
protected modes:string[] = []
protected defaultMode:string
protected allPatterns:{ [modeName:string]:RegExp[] } = {}
protected allPatterns:{ [modeName:string]:IRegExpExec[] } = {}
protected patternIdxToClass:{ [modeName:string]:Function[] } = {}
protected patternIdxToGroup:{ [modeName:string]:string[] } = {}
protected patternIdxToLongerAltIdx:{ [modeName:string]:number[] } = {}
@@ -472,8 +495,8 @@ export class Lexer {
text = text.substr(1)
offset++
for (j = 0; j < currModePatterns.length; j++) {
foundResyncPoint = currModePatterns[j].test(text)
if (foundResyncPoint) {
foundResyncPoint = currModePatterns[j].exec(text)
if (foundResyncPoint !== null) {
break
}
}
@@ -609,8 +632,8 @@ export class Lexer {
text = text.substr(1)
offset++
for (j = 0; j < currModePatterns.length; j++) {
foundResyncPoint = currModePatterns[j].test(text)
if (foundResyncPoint) {
foundResyncPoint = currModePatterns[j].exec(text)
if (foundResyncPoint !== null) {
break
}
}
38 changes: 32 additions & 6 deletions src/scan/tokens_public.ts
@@ -1,6 +1,6 @@
import {isString, isRegExp, isFunction, isUndefined, assignNoOverwrite, has} from "../utils/utils"
import {functionName, defineNameProp} from "../lang/lang_extensions"
import {Lexer, TokenConstructor} from "./lexer_public"
import {Lexer, TokenConstructor, IRegExpExec} from "./lexer_public"
import {
isInheritanceBasedToken,
getStartLineFromLazyToken,
@@ -14,11 +14,39 @@
augmentTokenClasses
} from "./tokens"

/**
* The type of custom pattern matcher functions.
* Matches should only be done on the start of the text.
* Note that this is identical to the signature of RegExp.prototype.exec
*
 * This should behave as if the RegExp match is using a start of input anchor.
 * So, for example, if a custom matcher is implemented for Tokens matching /\w+/,
 * the implementation must behave as if the pattern were /^\w+/.
*/
export type CustomPatternMatcherFunc = (test:string) => RegExpExecArray

/**
* Interface for custom user provided token pattern matchers.
*/
export interface ICustomPattern {
/**
* The custom pattern implementation.
* @see CustomPatternMatcherFunc
*/
exec:CustomPatternMatcherFunc
/**
* Flag indicating if this custom pattern may contain line terminators.
* This is required to avoid errors in the line/column numbering.
* @default false - if this property was not explicitly defined.
*/
containsLineTerminator?:boolean
}

/**
* This can be used to improve the quality/readability of error messages or syntax diagrams.
*
* @param {Function} clazz - A constructor for a Token subclass
* @returns {string} - The Human readable label a Token if it exists.
* @returns {string} - The Human readable label for a Token if it exists.
*/
export function tokenLabel(clazz:Function):string {
if (hasTokenLabel(clazz)) {
@@ -47,12 +75,11 @@ export function tokenName(clazz:Function):string {
}
}

// TODO: uppper or lower case name? or support both???
export interface ITokenConfig {
name:string
parent?:TokenConstructor
label?:string
pattern?:RegExp
pattern?:RegExp | CustomPatternMatcherFunc | ICustomPattern
group?:string|any
push_mode?:string
pop_mode?:boolean
@@ -67,7 +94,6 @@ const POP_MODE = "pop_mode"
const LONGER_ALT = "longer_alt"

/**
*
* @param {ITokenConfig} config - The configuration for
* @returns {TokenConstructor} - A constructor for the new Token subclass
*/
@@ -128,7 +154,7 @@ export function extendSimpleLazyToken(tokenName:string, patternOrParent:any = un
* extend and create Token subclasses in a less verbose manner
*
* @param {string} tokenName - The name of the new TokenClass
* @param {RegExp|Function} patternOrParent - RegExp Pattern or Parent Token Constructor
* @param {RegExp|CustomPatternMatcherFunc|Function} patternOrParent - RegExp Pattern or Parent Token Constructor
* @param {Function} parentConstructor - The Token class to be extended
* @returns {Function} - A constructor for the new extended Token subclass
*/
5 changes: 4 additions & 1 deletion src/utils/utils.ts
@@ -138,7 +138,10 @@ export function pick(obj:Object, predicate:(item) => boolean) {
}

export function has(obj:any, prop:string):boolean {
return obj.hasOwnProperty(prop)
if (isObject(obj)) {
return obj.hasOwnProperty(prop)
}
return false
}

export function contains<T>(arr:T[], item):boolean {

