From d92d3c35a097e83a0081678d6c8679340fcea8cc Mon Sep 17 00:00:00 2001
From: Colin E
Date: Sat, 20 Feb 2021 12:02:42 +0000
Subject: [PATCH] fix: fixed small bug with case insensitive range matching

---
Reviewer notes (this section is ignored by `git am`):

* With the `i` flag, CharacterRangeMatcher keeps the range as written and adds
  the opposite-case copy of whichever part of it overlaps a-z or A-Z.
* Range bounds are inclusive, so Range.intersection tests `lower <= upper`.
  A strict `<` would drop an overlap of exactly one character code, e.g. the
  'Z' end of [Z-a], and that character would never gain its other case.

 assembly/__spec_tests__/generated.spec.ts | 21 ++++++++++----
 assembly/__tests__/character-sets.spec.ts |  9 ++++++
 assembly/__tests__/utils.ts               | 16 ++++++++---
 assembly/nfa/matcher.ts                   | 34 ++++++++++++++++++-----
 assembly/util.ts                          | 26 ++++++++++++++++++
 spec/test-generator.js                    | 12 ++++----
 ts/index.ts                               |  8 ++----
 7 files changed, 96 insertions(+), 30 deletions(-)

diff --git a/assembly/__spec_tests__/generated.spec.ts b/assembly/__spec_tests__/generated.spec.ts
index 95c6422..0989a34 100644
--- a/assembly/__spec_tests__/generated.spec.ts
+++ b/assembly/__spec_tests__/generated.spec.ts
@@ -776,7 +776,11 @@ it("line: 152 - matches ([\\da-f:]+)$ against 'abc'", () => {
   expect(match.matches[0]).toBe("abc".substring(0, 3));
   expect(match.matches[1]).toBe("abc".substring(0, 3));
 });
-xit("line: 153 - aspect [Actual]: null vs [Expected]: Not null issue", () => {});
+it("line: 153 - matches ([\\da-f:]+)$ against 'fed'", () => {
+  const match = exec("([\\da-f:]+)$", "fed", "is");
+  expect(match.matches[0]).toBe("fed".substring(0, 3));
+  expect(match.matches[1]).toBe("fed".substring(0, 3));
+});
 it("line: 154 - matches ([\\da-f:]+)$ against 'E'", () => {
   const match = exec("([\\da-f:]+)$", "E", "is");
   expect(match.matches[0]).toBe("E".substring(0, 1));
@@ -1044,8 +1048,8 @@ xit("line: 199 - non capturing groups not supported", () => {});
 xit("line: 200 - non capturing groups not supported", () => {});
 xit("line: 201 - non capturing groups not supported", () => {});
 xit("line: 202 - non capturing groups not supported", () => {});
-xit("line: 203 - aspect [Actual]: null vs [Expected]: Not null issue", () => {});
-xit("line: 204 - aspect [Actual]: null vs [Expected]: Not null issue", () => {});
+xit("line: 203 - test appears to be incorrect?", () => {});
+xit("line: 204 - test appears to be incorrect?", () => {});
 it("line: 205 - matches ^ a\\ b[c ]d $ against 'abcd'", () => {
   expectNotMatch("^ a\\ b[c ]d $", ["abcd"]);
 });
@@ -1352,7 +1356,7 @@ it("line: 1083 - matches ^[ab]{1,3}(ab*?|b) against 'The quick brown fox'", () =
 xit("line: 1084 - back references are not supported", () => {});
 xit("line: 1085 - back references are not supported", () => {});
 xit("line: 1086 - test encoding issue", () => {});
-xit("line: 1087 - requires triage", () => {});
+xit("line: 1087 - test requires a substring function", () => {});
 xit("line: 1088 - requires triage", () => {});
 it("line: 1089 - matches abc\\x0def\\x00pqr\\x000xyz\\x0000AB against 'abc456 abc\x0def\x00pqr\x000xyz\x0000ABCDE'", () => {
   const match = exec(
@@ -1506,8 +1510,13 @@ it("line: 1144 - matches ^[W-c]+$ against 'WXY_^abc'", () => {
   const match = exec("^[W-c]+$", "WXY_^abc", "s");
   expect(match.matches[0]).toBe("WXY_^abc".substring(0, 8));
 });
-xit("line: 1145 - as-pect test issue", () => {});
-xit("line: 1146 - as-pect test issue", () => {});
+it("line: 1145 - matches ^[W-c]+$ against 'wxy'", () => {
+  expectNotMatch("^[W-c]+$", ["wxy"]);
+});
+it("line: 1146 - matches ^[W-c]+$ against 'WXY_^abc'", () => {
+  const match = exec("^[W-c]+$", "WXY_^abc", "is");
+  expect(match.matches[0]).toBe("WXY_^abc".substring(0, 8));
+});
 xit("line: 1147 - requires triage", () => {});
 xit("line: 1148 - requires triage", () => {});
 xit("line: 1149 - requires triage", () => {});
diff --git a/assembly/__tests__/character-sets.spec.ts b/assembly/__tests__/character-sets.spec.ts
index 47f5955..aeb7cf1 100644
--- a/assembly/__tests__/character-sets.spec.ts
+++ b/assembly/__tests__/character-sets.spec.ts
@@ -43,3 +43,12 @@ it("treats - as a literal in negated sets", () => {
   expectNotMatch("[^-abc]", ["-", "a", "b", "c"]);
   expectMatch("[^-abc]", ["1", "A"]);
 });
+
+it("supports case insensitive matching", () => {
+  // simple ranges
+  expectMatch("[a-c]", ["A", "C", "a", "c"], "i");
+  expectNotMatch("[a-c]", ["D", "d"], "i");
+  // complex
+  expectMatch("[W-c]", ["W", "w", "C", "c"], "i");
+  expectNotMatch("[W-c]", ["V", "v", "D", "d"], "i");
+});
diff --git a/assembly/__tests__/utils.ts b/assembly/__tests__/utils.ts
index 224daee..d264a23 100644
--- a/assembly/__tests__/utils.ts
+++ b/assembly/__tests__/utils.ts
@@ -1,7 +1,11 @@
 import { RegExp, Match } from "..";
 
-export function expectMatch(regex: string, arr: string[]): void {
-  let regexp = new RegExp(regex);
+export function expectMatch(
+  regex: string,
+  arr: string[],
+  flags: string = ""
+): void {
+  let regexp = new RegExp(regex, flags);
   for (let i = 0; i < arr.length; i++) {
     const value = arr[i];
     const match = exec(regexp, value);
@@ -9,8 +13,12 @@ export function expectMatch(regex: string, arr: string[]): void {
   }
 }
 
-export function expectNotMatch(regex: string, arr: string[]): void {
-  let regexp = new RegExp(regex);
+export function expectNotMatch(
+  regex: string,
+  arr: string[],
+  flags: string = ""
+): void {
+  let regexp = new RegExp(regex, flags);
   for (let i = 0; i < arr.length; i++) {
     const match = regexp.exec(arr[i]);
     expect(match).toBeNull(
diff --git a/assembly/nfa/matcher.ts b/assembly/nfa/matcher.ts
index a392601..b739bc4 100644
--- a/assembly/nfa/matcher.ts
+++ b/assembly/nfa/matcher.ts
@@ -8,6 +8,7 @@ import {
   NodeType,
 } from "../parser/node";
 import { Flags } from "../regexp";
+import { Range } from "../util";
 
 const enum MatcherType {
   Character,
@@ -36,7 +37,10 @@ export class Matcher {
     node: CharacterRangeNode,
     flags: Flags
   ): CharacterRangeMatcher {
-    return new CharacterRangeMatcher(node.from, node.to, flags.ignoreCase);
+    return new CharacterRangeMatcher(
+      new Range(node.from, node.to),
+      flags.ignoreCase
+    );
   }
 
   static fromCharacterSetNode(
@@ -89,20 +93,36 @@ export class CharacterMatcher extends Matcher {
   }
 }
 
+const LOWERCASE_LETTERS = new Range(Char.a, Char.z);
+const UPPERCASE_LETTERS = new Range(Char.A, Char.Z);
+const UPPER_LOWER_OFFSET = Char.a - Char.A;
+
 export class CharacterRangeMatcher extends Matcher {
-  constructor(private from: u32, private to: u32, private ignoreCase: bool) {
+  private ranges: Range[];
+
+  constructor(private range: Range, ignoreCase: bool) {
     super(MatcherType.CharacterRange);
+    this.ranges = [range];
+
     if (ignoreCase) {
-      this.from |= 0x20;
-      this.to |= 0x20;
+      const lowerIntersect = range.intersection(LOWERCASE_LETTERS);
+      if (lowerIntersect) {
+        this.ranges.push(lowerIntersect.offset(-UPPER_LOWER_OFFSET));
+      }
+      const upperIntersect = range.intersection(UPPERCASE_LETTERS);
+      if (upperIntersect) {
+        this.ranges.push(upperIntersect.offset(UPPER_LOWER_OFFSET));
+      }
     }
   }
 
   matches(code: u32): bool {
-    if (this.ignoreCase) {
-      code |= 0x20;
+    for (let i = 0, len = this.ranges.length; i < len; i++) {
+      if (code >= u32(this.ranges[i].from) && code <= u32(this.ranges[i].to)) {
+        return true;
+      }
     }
-    return code >= this.from && code <= this.to;
+    return false;
   }
 }
 
diff --git a/assembly/util.ts b/assembly/util.ts
index 8796260..9215e04 100644
--- a/assembly/util.ts
+++ b/assembly/util.ts
@@ -11,3 +11,29 @@ export function replaceAtIndex(arr: T[], index: u32, item: T): T[] {
   unchecked((res[index] = item));
   return res;
 }
+
+/**
+ * An inclusive range of character codes: both `from` and `to` belong to it.
+ */
+export class Range {
+  constructor(public from: i32, public to: i32) {}
+
+  /**
+   * Returns the overlap between this range and `other`, or null when the
+   * two ranges share no character code.
+   */
+  intersection(other: Range): Range | null {
+    const lower = i32(Math.max(this.from, other.from));
+    const upper = i32(Math.min(this.to, other.to));
+    // bounds are inclusive, so lower == upper is a valid single-character
+    // overlap (e.g. [a-a], or the 'Z' end of [Z-a]) and must not be dropped
+    return lower <= upper ? new Range(lower, upper) : null;
+  }
+
+  /**
+   * Returns a new range with both bounds shifted by `value`.
+   */
+  offset(value: i32): Range {
+    return new Range(this.from + value, this.to + value);
+  }
+}
diff --git a/spec/test-generator.js b/spec/test-generator.js
index 3bcd45a..dfc32eb 100644
--- a/spec/test-generator.js
+++ b/spec/test-generator.js
@@ -21,8 +21,11 @@ const knownIssues = {
     1288,
   ],
   "test contains an octal escape sequence": [1102],
+  // the test results measure captured groups using character length / locations
+  // see: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/length
+  // this is tricky to reproduce
+  "test requires a substring function": [1087],
   "requires triage": [
-    1087,
     1363,
     1369,
     1163,
@@ -32,15 +35,10 @@ const knownIssues = {
     1413,
     ...range(1301, 1308),
   ],
-  "as-pect test issue": [1145, 1146],
   "test indicates a malformed regex, whereas it appears OK in JS": [1189],
   "test regex contains syntax not supported in JS": [82, 1158, 281],
   "the test behaviour differs between PCRE and JS": [290],
-  "aspect [Actual]: null vs [Expected]: Not null issue": [
-    153,
-    203,
-    204,
-  ],
+  "test appears to be incorrect?": [203, 204],
 };
 
 const hasKnownIssue = (index) => {
diff --git a/ts/index.ts b/ts/index.ts
index 56f58b6..89080e1 100644
--- a/ts/index.ts
+++ b/ts/index.ts
@@ -5,10 +5,6 @@ globalAny.log = console.log;
 
 import { RegExp } from "../assembly/regexp";
 
-const regexObj = new RegExp("^(a){1,3}");
-const match = regexObj.exec("abc");
+const regexObj = new RegExp("[a-c]", "i");
+const match = regexObj.exec("A");
 console.log(JSON.stringify(match, null, 2));
-
-const regexObj2 = new RegExp("(a|b)c|a(b|c)");
-const match2 = regexObj2.exec("ab");
-console.log(JSON.stringify(match2, null, 2));