Skip to content

Commit

Permalink
Support supplementary CPs in Unicode identifiers (#2522)
Browse files Browse the repository at this point in the history
  • Loading branch information
dcodeIO committed Sep 26, 2022
1 parent 3135e7e commit f8a775f
Show file tree
Hide file tree
Showing 6 changed files with 457 additions and 169 deletions.
42 changes: 42 additions & 0 deletions scripts/unicode-identifier.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
// see https://github.com/microsoft/TypeScript/blob/main/scripts/regenerate-unicode-identifier-parts.js

/** Highest code point that is scanned (inclusive). */
const MAX_UNICODE_CODEPOINT = 0x10FFFF;

// Other_ID_Start explicitly included for back compat - see http://www.unicode.org/reports/tr31/#Introduction
const ID_START_PATTERN = /[\p{ID_Start}\u{2118}\u{212E}\u{309B}\u{309C}]/u;
// Likewise for Other_ID_Continue
const ID_CONTINUE_PATTERN = /[\p{ID_Continue}\u{00B7}\u{0387}\u{19DA}\u{1369}\u{136A}\u{136B}\u{136C}\u{136D}\u{136E}\u{136F}\u{1370}\u{1371}]/u;

/** Tests whether the single-code-point string `c` matches the identifier-start pattern. */
const isStart = c => ID_START_PATTERN.test(c);

/** Tests whether the single-code-point string `c` matches the identifier-part pattern or is an identifier start. */
const isPart = c => ID_CONTINUE_PATTERN.test(c) || isStart(c);

/**
 * Scans code points 123..MAX_UNICODE_CODEPOINT and returns a flat list of
 * inclusive range bounds `[from0, to0, from1, to1, ...]` covering every run of
 * consecutive code points accepted by `matches`.
 */
function collectRangeBounds(matches) {
  const bounds = [];
  let insideRange = false;
  // Skip 0-9 (48..57), A-Z (65..90), a-z (97..122) - checked otherwise
  for (let cp = 123; cp <= MAX_UNICODE_CODEPOINT; cp++) {
    if (matches(String.fromCodePoint(cp)) === insideRange) continue;
    // Entering a run records its first code point; leaving a run records the
    // previous code point, which was the last one still inside it.
    bounds.push(insideRange ? cp - 1 : cp);
    insideRange = !insideRange;
  }
  // Close a run that extends to the very last code point.
  if (insideRange) bounds.push(MAX_UNICODE_CODEPOINT);
  return bounds;
}

const starts = collectRangeBounds(isStart);
const parts = collectRangeBounds(isPart);

/**
 * Formats a flat list of code points as the body of an array literal.
 *
 * The result starts with a block comment that labels the columns as
 * from/to pairs, followed by the values laid out eight per row. Each value is
 * left-aligned in a field of at least six characters and followed by ", ".
 * The string always ends with a newline.
 *
 * @param cps Values to print; each must provide `toString()`.
 * @returns The formatted table text.
 */
function tablify(cps) {
  const columnLegend = "/*\n| from ... to | from ... to | from ... to | from ... to |*/";
  const cells = cps.map((cp, index) => {
    // Every eighth value opens a new row.
    const rowBreak = index % 8 === 0 ? "\n " : "";
    return `${rowBreak}${cp.toString().padEnd(6)}, `;
  });
  return `${columnLegend}${cells.join("")}\n`;
}

// Print the generated declarations to stdout: for identifier starts and then
// identifier parts, a doc comment naming the Unicode version reported by the
// runtime, the range table, and the lowest and highest bound of that table.
const unicodeVersion = process.versions.unicode;
const outputLines = [
  `/** Unicode ${unicodeVersion} ID_Start/Other_ID_Start ranges */`,
  `const unicodeIdentifierStart: i32[] = [${tablify(starts)}];`,
  `const unicodeIdentifierStartMin = ${starts[0]};`,
  `const unicodeIdentifierStartMax = ${starts[starts.length - 1]};\n`,
  `/** Unicode ${unicodeVersion} ID_Continue/Other_ID_Continue + ID_Start/Other_ID_Start ranges*/`,
  `const unicodeIdentifierPart: i32[] = [${tablify(parts)}];`,
  `const unicodeIdentifierPartMin = ${parts[0]};`,
  `const unicodeIdentifierPartMax = ${parts[parts.length - 1]};\n`
];
// One console.log call per entry, in order.
for (const line of outputLines) {
  console.log(line);
}
2 changes: 1 addition & 1 deletion src/diagnostics.ts
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ function formatDiagnosticContext(range: Range): string {
var lineSpace = " ".repeat(lineNumber.length);
// Find preceding line break
while (start > 0 && !isLineBreak(text.charCodeAt(start - 1))) start--;
// Skip leading whitespace
// Skip leading whitespace (assume no supplementary whitespaces)
while (start < len && isWhiteSpace(text.charCodeAt(start))) start++;
// Find next line break
while (end < len && !isLineBreak(text.charCodeAt(end))) end++;
Expand Down
26 changes: 15 additions & 11 deletions src/tokenizer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ import {
isOctal,
isHexBase,
isHighSurrogate,
isLowSurrogate
combineSurrogates,
numCodeUnits
} from "./util";

/** Named token types. */
Expand Down Expand Up @@ -913,11 +914,15 @@ export class Tokenizer extends DiagnosticEmitter {
return Token.AT;
}
default: {
// Unicode-aware from here on
if (isHighSurrogate(c) && pos + 1 < end) {
c = combineSurrogates(c, text.charCodeAt(pos + 1));
}
if (isIdentifierStart(c)) {
let posBefore = pos;
while (
++pos < end &&
isIdentifierPart(c = text.charCodeAt(pos))
(pos += numCodeUnits(c)) < end &&
isIdentifierPart(c = <i32>text.codePointAt(pos))
) { /* nop */ }
if (identifierHandling != IdentifierHandling.ALWAYS) {
let maybeKeywordToken = tokenFromKeyword(text.substring(posBefore, pos));
Expand All @@ -935,14 +940,11 @@ export class Tokenizer extends DiagnosticEmitter {
this.pos = posBefore;
return Token.IDENTIFIER;
} else if (isWhiteSpace(c)) {
++pos;
++pos; // assume no supplementary whitespaces
break;
}
let start = pos++;
if (
isHighSurrogate(c) && pos < end &&
isLowSurrogate(text.charCodeAt(pos))
) ++pos;
let start = pos;
pos += numCodeUnits(c);
this.error(
DiagnosticCode.Invalid_character,
this.range(start, pos)
Expand Down Expand Up @@ -1055,9 +1057,11 @@ export class Tokenizer extends DiagnosticEmitter {
var end = this.end;
var pos = this.pos;
var start = pos;
var c = <i32>text.codePointAt(pos);
assert(isIdentifierStart(c));
while (
++pos < end &&
isIdentifierPart(text.charCodeAt(pos))
(pos += numCodeUnits(c)) < end &&
isIdentifierPart(c = <i32>text.codePointAt(pos))
);
this.pos = pos;
return text.substring(start, pos);
Expand Down

0 comments on commit f8a775f

Please sign in to comment.