Support supplementary CPs in Unicode identifiers (#2522)

AssemblyScript · Sep 26, 2022 · f8a775f · f8a775f
1 parent 3135e7e
commit f8a775f
Show file tree

Hide file tree

Showing 6 changed files with 457 additions and 169 deletions.
diff --git a/scripts/unicode-identifier.js b/scripts/unicode-identifier.js
@@ -0,0 +1,42 @@
+// see https://github.com/microsoft/TypeScript/blob/main/scripts/regenerate-unicode-identifier-parts.js
+
+const MAX_UNICODE_CODEPOINT = 0x10FFFF; // inclusive upper bound of the code point scan below
+const isStart = c => /[\p{ID_Start}\u{2118}\u{212E}\u{309B}\u{309C}]/u.test(c); // Other_ID_Start explicitly included for back compat - see http://www.unicode.org/reports/tr31/#Introduction
+const isPart = c => /[\p{ID_Continue}\u{00B7}\u{0387}\u{19DA}\u{1369}\u{136A}\u{136B}\u{136C}\u{136D}\u{136E}\u{136F}\u{1370}\u{1371}]/u.test(c) || isStart(c); // Likewise for Other_ID_Continue
+const parts = []; // flat list of inclusive range bounds (from, to, from, to, ...) of code points accepted by isPart
+let partsActive = false; // true while the scan is inside an open isPart range
+let startsActive = false; // true while the scan is inside an open isStart range
+const starts = []; // flat list of inclusive range bounds (from, to, from, to, ...) of code points accepted by isStart
+
+// Skip 0-9 (48..57), A-Z (65..90), a-z (97..122) - checked otherwise
+for (let cp = 123; cp <= MAX_UNICODE_CODEPOINT; cp++) {
+  if (isStart(String.fromCodePoint(cp)) !== startsActive) { // membership flipped at cp: a range opens or closes here
+    starts.push(cp - +startsActive); // opening: push cp as "from"; closing: push cp - 1 as the inclusive "to"
+    startsActive = !startsActive;
+  }
+  if (isPart(String.fromCodePoint(cp)) !== partsActive) { // same boundary detection for identifier-part code points
+    parts.push(cp - +partsActive); // opening: push cp as "from"; closing: push cp - 1 as the inclusive "to"
+    partsActive = !partsActive;
+  }
+}
+if (startsActive) starts.push(MAX_UNICODE_CODEPOINT); // close a range that is still open when the scan ends
+if (partsActive) parts.push(MAX_UNICODE_CODEPOINT); // close a range that is still open when the scan ends
+
+function tablify(cps) { // formats the flat bounds array `cps` as array-literal rows under a "from ... to" header comment
+  let sb = ["/*\n| from  ...  to | from  ...  to | from  ...  to | from  ...  to |*/"];
+  let i = 0; // index of the next value of `cps` to emit
+  while (i < cps.length) {
+    if (!(i % 8)) sb.push("\n  "); // start a new indented row every 8 values (4 from/to pairs, matching the header)
+    sb.push(`${cps[i++].toString().padEnd(6)}, `); // pad each value to at least 6 characters so columns line up
+  }
+  return sb.join("") + "\n"; // trailing newline puts the caller's closing bracket on its own line
+}
+
+console.log(`/** Unicode ${process.versions.unicode} ID_Start/Other_ID_Start ranges */`); // header names the Unicode version reported by the running Node.js
+console.log(`const unicodeIdentifierStart: i32[] = [${tablify(starts)}];`); // emits the start-range table as a typed array declaration
+console.log(`const unicodeIdentifierStartMin = ${starts[0]};`); // first recorded bound, i.e. the lowest tabulated start code point
+console.log(`const unicodeIdentifierStartMax = ${starts[starts.length - 1]};\n`); // last recorded bound, i.e. the highest tabulated start code point
+console.log(`/** Unicode ${process.versions.unicode} ID_Continue/Other_ID_Continue + ID_Start/Other_ID_Start ranges*/`); // header for the identifier-part table
+console.log(`const unicodeIdentifierPart: i32[] = [${tablify(parts)}];`); // emits the part-range table as a typed array declaration
+console.log(`const unicodeIdentifierPartMin = ${parts[0]};`); // first recorded bound, i.e. the lowest tabulated part code point
+console.log(`const unicodeIdentifierPartMax = ${parts[parts.length - 1]};\n`); // last recorded bound, i.e. the highest tabulated part code point
diff --git a/src/diagnostics.ts b/src/diagnostics.ts
@@ -267,7 +267,7 @@ function formatDiagnosticContext(range: Range): string {
   var lineSpace = " ".repeat(lineNumber.length);
  // Find preceding line break
   while (start > 0 && !isLineBreak(text.charCodeAt(start - 1))) start--;
-  // Skip leading whitespace
+  // Skip leading whitespace (assumes no whitespace character is a supplementary code point)
   while (start < len && isWhiteSpace(text.charCodeAt(start))) start++;
   // Find next line break
   while (end < len && !isLineBreak(text.charCodeAt(end))) end++;

diff --git a/src/tokenizer.ts b/src/tokenizer.ts
@@ -33,7 +33,8 @@ import {
   isOctal,
   isHexBase,
   isHighSurrogate,
-  isLowSurrogate
+  combineSurrogates,
+  numCodeUnits
 } from "./util";
 
 /** Named token types. */
@@ -913,11 +914,15 @@ export class Tokenizer extends DiagnosticEmitter {
           return Token.AT;
         }
         default: {
+          // Unicode-aware from here on
+          if (isHighSurrogate(c) && pos + 1 < end) {
+            c = combineSurrogates(c, text.charCodeAt(pos + 1));
+          }
           if (isIdentifierStart(c)) {
             let posBefore = pos;
             while (
-              ++pos < end &&
-              isIdentifierPart(c = text.charCodeAt(pos))
+              (pos += numCodeUnits(c)) < end &&
+              isIdentifierPart(c = <i32>text.codePointAt(pos))
             ) { /* nop */ }
             if (identifierHandling != IdentifierHandling.ALWAYS) {
               let maybeKeywordToken = tokenFromKeyword(text.substring(posBefore, pos));
@@ -935,14 +940,11 @@ export class Tokenizer extends DiagnosticEmitter {
             this.pos = posBefore;
             return Token.IDENTIFIER;
           } else if (isWhiteSpace(c)) {
-            ++pos;
+            ++pos; // assume no supplementary whitespaces
             break;
           }
-          let start = pos++;
-          if (
-            isHighSurrogate(c) && pos < end &&
-            isLowSurrogate(text.charCodeAt(pos))
-          ) ++pos;
+          let start = pos;
+          pos += numCodeUnits(c);
           this.error(
             DiagnosticCode.Invalid_character,
             this.range(start, pos)
@@ -1055,9 +1057,11 @@ export class Tokenizer extends DiagnosticEmitter {
     var end = this.end;
     var pos = this.pos;
     var start = pos;
+    var c = <i32>text.codePointAt(pos);
+    assert(isIdentifierStart(c));
     while (
-      ++pos < end &&
-      isIdentifierPart(text.charCodeAt(pos))
+      (pos += numCodeUnits(c)) < end &&
+      isIdentifierPart(c = <i32>text.codePointAt(pos))
     );
     this.pos = pos;
     return text.substring(start, pos);