Skip to content

Commit e56abf7

Browse files
authored
Refactor text util surrogate helpers (AssemblyScript#2146)
1 parent 30c7118 commit e56abf7

File tree

3 files changed

+68
-19
lines changed

3 files changed

+68
-19
lines changed

src/module.ts

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,13 @@
1010

1111
import { BuiltinNames } from "./builtins";
1212
import { Target } from "./common";
13+
import {
14+
isHighSurrogate,
15+
isLowSurrogate,
16+
combineSurrogates,
17+
SURROGATE_HIGH,
18+
SURROGATE_LOW
19+
} from "./util";
1320
import * as binaryen from "./glue/binaryen";
1421

1522
/** A Binaryen-compatible index. */
@@ -3108,8 +3115,8 @@ function stringLengthUTF8(str: string): usize {
31083115
} else if (c1 <= 0x7FF) {
31093116
len += 2;
31103117
} else if (
3111-
(c1 & 0xFC00) === 0xD800 && i + 1 < k &&
3112-
(str.charCodeAt(i + 1) & 0xFC00) === 0xDC00
3118+
isHighSurrogate(c1) && i + 1 < k &&
3119+
isLowSurrogate(str.charCodeAt(i + 1))
31133120
) {
31143121
i++;
31153122
len += 4;
@@ -3146,10 +3153,10 @@ function allocString(str: string | null): usize {
31463153
binaryen.__i32_store8(idx++, (0xC0 | (c1 >>> 6) ) as u8);
31473154
binaryen.__i32_store8(idx++, (0x80 | ( c1 & 63)) as u8);
31483155
} else if (
3149-
(c1 & 0xFC00) === 0xD800 && i + 1 < k &&
3150-
((c2 = str.charCodeAt(i + 1)) & 0xFC00) === 0xDC00
3156+
isHighSurrogate(c1) && i + 1 < k &&
3157+
isLowSurrogate(c2 = str.charCodeAt(i + 1))
31513158
) {
3152-
c1 = 0x10000 + ((c1 & 0x3FF) << 10) | (c2 & 0x3FF);
3159+
c1 = combineSurrogates(c1, c2);
31533160
++i;
31543161
binaryen.__i32_store8(idx++, (0xF0 | (c1 >>> 18) ) as u8);
31553162
binaryen.__i32_store8(idx++, (0x80 | ((c1 >>> 12) & 63)) as u8);
@@ -3209,10 +3216,11 @@ export function readString(ptr: usize): string | null {
32093216
arr.push(cp);
32103217
} else {
32113218
let ch = cp - 0x10000;
3212-
arr.push(0xD800 | (ch >>> 10));
3213-
arr.push(0xDC00 | (ch & 0x3FF));
3219+
arr.push(SURROGATE_HIGH | (ch >>> 10));
3220+
arr.push(SURROGATE_LOW | (ch & 0x3FF));
32143221
}
32153222
}
3223+
// TODO: implement and use String.fromCodePoints
32163224
return String.fromCharCodes(arr);
32173225
}
32183226

src/tokenizer.ts

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,9 @@ import {
2929
isIdentifierStart,
3030
isIdentifierPart,
3131
isDecimal,
32-
isOctal
32+
isOctal,
33+
isHighSurrogate,
34+
isLowSurrogate
3335
} from "./util";
3436

3537
/** Named token types. */
@@ -976,9 +978,9 @@ export class Tokenizer extends DiagnosticEmitter {
976978
break;
977979
}
978980
let start = pos++;
979-
if ( // surrogate pair?
980-
(c & 0xFC00) == 0xD800 && pos < end &&
981-
((text.charCodeAt(pos)) & 0xFC00) == 0xDC00
981+
if (
982+
isHighSurrogate(c) && pos < end &&
983+
isLowSurrogate(text.charCodeAt(pos))
982984
) ++pos;
983985
this.error(
984986
DiagnosticCode.Invalid_character,
@@ -1216,7 +1218,7 @@ export class Tokenizer extends DiagnosticEmitter {
12161218
case CharCode.LINEFEED:
12171219
case CharCode.LINESEPARATOR:
12181220
case CharCode.PARAGRAPHSEPARATOR: return "";
1219-
default: return String.fromCharCode(c);
1221+
default: return String.fromCodePoint(c);
12201222
}
12211223
}
12221224

@@ -1677,7 +1679,7 @@ export class Tokenizer extends DiagnosticEmitter {
16771679
return "";
16781680
}
16791681
this.pos = pos;
1680-
return String.fromCharCode(value);
1682+
return String.fromCodePoint(value);
16811683
}
16821684

16831685
checkForIdentifierStartAfterNumericLiteral(): void {
@@ -1739,12 +1741,7 @@ export class Tokenizer extends DiagnosticEmitter {
17391741
? text.substring(startIfTaggedTemplate, this.pos)
17401742
: "";
17411743
}
1742-
return value32 < 0x10000
1743-
? String.fromCharCode(value32)
1744-
: String.fromCharCode(
1745-
((value32 - 0x10000) >>> 10) | 0xD800,
1746-
((value32 - 0x10000) & 1023) | 0xDC00
1747-
);
1744+
return String.fromCodePoint(value32);
17481745
}
17491746
}
17501747

src/util/text.ts

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,50 @@ export function isWhiteSpace(c: i32): bool {
175175
}
176176
}
177177

178+
/** First high (lead) surrogate. High surrogates span 0xD800-0xDBFF. */
export const SURROGATE_HIGH = 0xD800;

/** First low (trail) surrogate. Low surrogates span 0xDC00-0xDFFF. */
export const SURROGATE_LOW = 0xDC00;
183+
184+
/**
 * Tests if a code unit or code point is a surrogate (0xD800-0xDFFF).
 *
 * Every bit above the low 11 takes part in the comparison, so a supplementary
 * code point whose low 16 bits merely look like a surrogate (e.g. 0x1D800) is
 * rejected instead of being reported as a surrogate.
 */
export function isSurrogate(c: i32): bool {
  // ~7FF: 1..1 11111 0 0000000000  Mask (all bits above the low 11)
  // D800: 0..0 11011 X XXXXXXXXXX  Any surrogate
  return (c & ~0x7FF) == SURROGATE_HIGH;
}
190+
191+
/**
 * Tests if a surrogate is a high (lead) surrogate.
 *
 * Only meaningful for a value already known to be a surrogate: the check is a
 * single comparison, so any value below 0xDC00 yields `true`, surrogate or not.
 */
export function isSurrogateHigh(c: i32): bool {
  // D800-DBFF
  return c < SURROGATE_LOW;
}
196+
197+
/**
 * Tests if a surrogate is a low (trail) surrogate.
 *
 * Only meaningful for a value already known to be a surrogate: the check is a
 * single comparison, so any value from 0xDC00 upwards yields `true`, surrogate
 * or not.
 */
export function isSurrogateLow(c: i32): bool {
  // DC00-DFFF
  return c >= SURROGATE_LOW;
}
202+
203+
/**
 * Tests if a code unit or code point is a high (lead) surrogate
 * (0xD800-0xDBFF).
 *
 * Every bit above the low 10 takes part in the comparison, so a supplementary
 * code point whose low 16 bits merely look like a high surrogate (e.g.
 * 0x1D800) is rejected.
 */
export function isHighSurrogate(c: i32): bool {
  // ~3FF: 1..1 11111 1 0000000000  Mask (all bits above the low 10)
  // D800: 0..0 11011 0 XXXXXXXXXX  High/Lead surrogate
  return (c & ~0x3FF) == SURROGATE_HIGH;
}
209+
210+
/**
 * Tests if a code unit or code point is a low (trail) surrogate
 * (0xDC00-0xDFFF).
 *
 * Every bit above the low 10 takes part in the comparison, so a supplementary
 * code point whose low 16 bits merely look like a low surrogate (e.g.
 * 0x1DC00) is rejected.
 */
export function isLowSurrogate(c: i32): bool {
  // ~3FF: 1..1 11111 1 0000000000  Mask (all bits above the low 10)
  // DC00: 0..0 11011 1 XXXXXXXXXX  Low/Trail surrogate
  return (c & ~0x3FF) == SURROGATE_LOW;
}
216+
217+
/**
 * Converts a surrogate pair to its respective code point.
 *
 * Only the low 10 bits of each argument are used; the inputs are not checked
 * for actually being a high and a low surrogate.
 *
 * @param hi High (lead) surrogate, contributes the upper 10 payload bits.
 * @param lo Low (trail) surrogate, contributes the lower 10 payload bits.
 * @returns The combined code point in the range 0x10000-0x10FFFF.
 */
export function combineSurrogates(hi: i32, lo: i32): i32 {
  // Grouping is explicit: assemble the 20-bit payload first, then add the
  // supplementary-plane offset, rather than relying on `+` binding tighter
  // than `|`.
  return 0x10000 + (((hi & 0x3FF) << 10) | (lo & 0x3FF));
}
221+
178222
export function isAlpha(c: i32): bool {
179223
let c0 = c | 32; // unify uppercases and lowercases a|A - z|Z
180224
return c0 >= CharCode.a && c0 <= CharCode.z;

0 commit comments

Comments
 (0)